diff --git a/.gitattributes b/.gitattributes
index 7209fd38324399a0f5f0a6ba67e34084b48015c3..06f2f322bc2819a5d99da3c394bcb222f6d49144 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -540,3 +540,61 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 4b284b12bc4seed1/evaluation/generation/examples.4b284b12bc4seed1_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
 4b284b12bc4seed1/evaluation/generation/examples.4b284b12bc4seed1_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
 4b284b12bc4seed1/evaluation/generation/examples.4b284b12bc4seed1_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_5.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_3.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl filter=lfs diff=lfs merge=lfs -text
+4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_0.jsonl filter=lfs diff=lfs merge=lfs -text
diff --git a/4b284b12bc4seed1/evaluation/generation/merged.csv b/4b284b12bc4seed1/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..cb0f5adc10b4086e702c7704bf94d6a15a786b0d
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0046873222992475675
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0046873222992475675
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.10965942344261605
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.10965942344261605
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.13810992950827863
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.13810992950827863
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.15830106057751542
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.15830106057751542
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1643064184413658
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1643064184413658
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1672452517241617
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1672452517241617
+e2e_nlg_cleaned,5,average,multiple,0.12371823433219753
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.00907913529407834
+gem_xsum,0,median,rouge2_fmeasure,0.00907913529407834
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.009694349099296177
+gem_xsum,1,median,rouge2_fmeasure,0.009694349099296177
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.013095799671134395
+gem_xsum,2,median,rouge2_fmeasure,0.013095799671134395
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.014219210333358922
+gem_xsum,3,median,rouge2_fmeasure,0.014219210333358922
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.005260596389372223
+gem_xsum,4,median,rouge2_fmeasure,0.005260596389372223
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,4.1835752834372257e-05
+gem_xsum,5,median,rouge2_fmeasure,4.1835752834372257e-05
+gem_xsum,5,average,multiple,0.008565154423345739
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05709876588901533
+web_nlg_en,0,median,rouge2_fmeasure,0.05709876588901533
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.056707072109866384
+web_nlg_en,1,median,rouge2_fmeasure,0.056707072109866384
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.057190355206892265
+web_nlg_en,2,median,rouge2_fmeasure,0.057190355206892265
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05729780671036665
+web_nlg_en,3,median,rouge2_fmeasure,0.05729780671036665
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05717656346160751
+web_nlg_en,4,median,rouge2_fmeasure,0.05717656346160751
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.056226522227932986
+web_nlg_en,5,median,rouge2_fmeasure,0.056226522227932986
+web_nlg_en,5,average,multiple,0.05694951426761352
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.004231279339262943
+wiki_lingua_en,0,median,rouge2_fmeasure,0.004231279339262943
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.012970684860542986
+wiki_lingua_en,1,median,rouge2_fmeasure,0.012970684860542986
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.018558312186827172
+wiki_lingua_en,2,median,rouge2_fmeasure,0.018558312186827172
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.022809767017555965
+wiki_lingua_en,3,median,rouge2_fmeasure,0.022809767017555965
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.009630515215905031
+wiki_lingua_en,4,median,rouge2_fmeasure,0.009630515215905031
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0015852585648270666
+wiki_lingua_en,5,median,rouge2_fmeasure,0.0015852585648270666
+wiki_lingua_en,5,average,multiple,0.011630969530820195
diff --git a/4b284b12bc4seed1/evaluation/generation/merged.json b/4b284b12bc4seed1/evaluation/generation/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae16fbd3e0e399b4dad07924b5132db9e1f8ddb5
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/generation/merged.json
@@ -0,0 +1 @@
+{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4137289209321652, "bleu_stderr": 0.03592840698985395, "rouge1_fmeasure": 0.12202957876438714, "rouge1_fmeasure_stderr": 0.0020897173297206215, "rouge1_precision": 0.08052703538584219, "rouge1_precision_stderr": 0.0017032922927436723, "rouge1_recall": 0.34685731924417923, "rouge1_recall_stderr": 0.004696819146262294, "rouge2_fmeasure": 0.05709876588901533, "rouge2_fmeasure_stderr": 0.001338487535015282, "rouge2_precision": 0.0371963393448065, "rouge2_precision_stderr": 0.001000406080573449, "rouge2_recall": 0.16920772861048264, "rouge2_recall_stderr": 0.003352502998815072, "rougeL_fmeasure": 0.11638689693482958, "rougeL_fmeasure_stderr": 0.0018866066835855856, "rougeL_precision": 0.07642927515383495, "rougeL_precision_stderr": 0.0015283993241970184, "rougeL_recall": 0.33488912829275347, "rougeL_recall_stderr": 0.0045599664755635915, "rougeLsum_fmeasure": 0.11524865510935987, "rougeLsum_fmeasure_stderr": 0.001924801119419314, "rougeLsum_precision": 0.07604904600865044, "rougeLsum_precision_stderr": 0.0015809038296551435, "rougeLsum_recall": 0.32815010771859937, "rougeLsum_recall_stderr": 0.004354254520219142}}, "1": {"PALM_prompt": {"bleu": 0.44320235453851886, "bleu_stderr": 0.038654580038729125, "rouge1_fmeasure": 0.12119128966268897, "rouge1_fmeasure_stderr": 0.0020394652462786458, "rouge1_precision": 0.0788292625128184, "rouge1_precision_stderr": 0.001545865630724369, "rouge1_recall": 0.34898005560018197, "rouge1_recall_stderr": 0.004794912547747137, "rouge2_fmeasure": 0.056707072109866384, "rouge2_fmeasure_stderr": 0.001298880402066703, "rouge2_precision": 0.03674083737130746, "rouge2_precision_stderr": 0.0009509963850229752, "rouge2_recall": 0.17151623711827377, "rouge2_recall_stderr": 0.0034511631482392565, "rougeL_fmeasure": 0.1158600041349514, "rougeL_fmeasure_stderr": 0.0018669882568564582, "rougeL_precision": 0.07498774748969433, "rougeL_precision_stderr": 0.0013853305220861518, "rougeL_recall": 0.33741306095617796, "rougeL_recall_stderr": 0.004672233537657657, "rougeLsum_fmeasure": 0.11473200282386622, "rougeLsum_fmeasure_stderr": 0.0019120368338142416, "rougeLsum_precision": 0.07464447143195571, "rougeLsum_precision_stderr": 0.001449910621324083, "rougeLsum_recall": 0.3300443408931697, "rougeLsum_recall_stderr": 0.0044396729039993124}}, "2": {"PALM_prompt": {"bleu": 0.4318507139656118, "bleu_stderr": 0.020126652103215823, "rouge1_fmeasure": 0.1211597172909803, "rouge1_fmeasure_stderr": 0.002024707920166121, "rouge1_precision": 0.07846937247932836, "rouge1_precision_stderr": 0.0015076119882704358, "rouge1_recall": 0.3512872090522125, "rouge1_recall_stderr": 0.004843993030116799, "rouge2_fmeasure": 0.057190355206892265, "rouge2_fmeasure_stderr": 0.0012758800698632082, "rouge2_precision": 0.03689332195406738, "rouge2_precision_stderr": 0.000921539572592384, "rouge2_recall": 0.17407998451855428, "rouge2_recall_stderr": 0.003436093615166077, "rougeL_fmeasure": 0.11605588796630185, "rougeL_fmeasure_stderr": 0.0018703477264729452, "rougeL_precision": 0.07486170965273004, "rougeL_precision_stderr": 0.0013700867024842227, "rougeL_recall": 0.3400276434993588, "rougeL_recall_stderr": 0.004727704994934382, "rougeLsum_fmeasure": 0.1146567492990496, "rougeLsum_fmeasure_stderr": 0.0018940503662138908, "rougeLsum_precision": 0.07425974001821288, "rougeLsum_precision_stderr": 0.001410836040797904, "rougeLsum_recall": 0.33236868938870073, "rougeLsum_recall_stderr": 0.004471409789333774}}, "3": {"PALM_prompt": {"bleu": 0.437820931259575, "bleu_stderr": 0.032980906480770865, "rouge1_fmeasure": 0.12105811695552406, "rouge1_fmeasure_stderr": 0.002038435380001679, "rouge1_precision": 0.0785926975391468, "rouge1_precision_stderr": 0.0015293603423737205, "rouge1_recall": 0.3493939393936786, "rouge1_recall_stderr": 0.004787694752084967, "rouge2_fmeasure": 0.05729780671036665, "rouge2_fmeasure_stderr": 0.0012917575350816215, "rouge2_precision": 0.03703345126698836, "rouge2_precision_stderr": 0.0009394498824059976, "rouge2_recall": 0.17399497492711738, "rouge2_recall_stderr": 0.0034376201679732726, "rougeL_fmeasure": 0.11550661899372622, "rougeL_fmeasure_stderr": 0.001880977056199583, "rougeL_precision": 0.07468535679699133, "rougeL_precision_stderr": 0.0013893206714726519, "rougeL_recall": 0.33714313164357695, "rougeL_recall_stderr": 0.004665286047497972, "rougeLsum_fmeasure": 0.11457534646291556, "rougeLsum_fmeasure_stderr": 0.0019112644015919146, "rougeLsum_precision": 0.07441958215060634, "rougeLsum_precision_stderr": 0.0014367820161710976, "rougeLsum_recall": 0.3308931008083881, "rougeLsum_recall_stderr": 0.004444824580452303}}, "4": {"PALM_prompt": {"bleu": 0.44162298156671387, "bleu_stderr": 0.03391466255022238, "rouge1_fmeasure": 0.12061836300623924, "rouge1_fmeasure_stderr": 0.0019968483074851587, "rouge1_precision": 0.07809208129692338, "rouge1_precision_stderr": 0.0014897473758506577, "rouge1_recall": 0.3509704340756107, "rouge1_recall_stderr": 0.004753376988143335, "rouge2_fmeasure": 0.05717656346160751, "rouge2_fmeasure_stderr": 0.001268286668919329, "rouge2_precision": 0.03681990692658017, "rouge2_precision_stderr": 0.000915025627561909, "rouge2_recall": 0.17569168523985457, "rouge2_recall_stderr": 0.003469639579874031, "rougeL_fmeasure": 0.11522040390086466, "rougeL_fmeasure_stderr": 0.0018477157862829916, "rougeL_precision": 0.07430240518062578, "rougeL_precision_stderr": 0.001356662158276597, "rougeL_recall": 0.33848067082117866, "rougeL_recall_stderr": 0.004630272763143682, "rougeLsum_fmeasure": 0.11384094914867832, "rougeLsum_fmeasure_stderr": 0.0018639383740272559, "rougeLsum_precision": 0.0737119127298611, "rougeLsum_precision_stderr": 0.0013930977884493516, "rougeLsum_recall": 0.33180763273757674, "rougeLsum_recall_stderr": 0.0044219813999489994}}, "5": {"PALM_prompt": {"bleu": 0.43584026292753897, "bleu_stderr": 0.03585233394325135, "rouge1_fmeasure": 0.11885306502412998, "rouge1_fmeasure_stderr": 0.0020096665427829846, "rouge1_precision": 0.0771055828854556, "rouge1_precision_stderr": 0.001542950246478603, "rouge1_recall": 0.3485317236937699, "rouge1_recall_stderr": 0.004899363014116322, "rouge2_fmeasure": 0.056226522227932986, "rouge2_fmeasure_stderr": 0.0012937028286243123, "rouge2_precision": 0.03635257403197826, "rouge2_precision_stderr": 0.0009726184426252793, "rouge2_recall": 0.17491992097726822, "rouge2_recall_stderr": 0.003638732362333741, "rougeL_fmeasure": 0.11305367539046324, "rougeL_fmeasure_stderr": 0.0018447783930060325, "rougeL_precision": 0.07302888437056451, "rougeL_precision_stderr": 0.0013873752417405871, "rougeL_recall": 0.3349645978185685, "rougeL_recall_stderr": 0.004754512380009764, "rougeLsum_fmeasure": 0.11195050334054779, "rougeLsum_fmeasure_stderr": 0.001866391249607523, "rougeLsum_precision": 0.07261259826097624, "rougeLsum_precision_stderr": 0.001427137555756464, "rougeLsum_recall": 0.3288098925316197, "rougeLsum_recall_stderr": 0.004531291754719013}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.049986883599927444, "bleu_stderr": 0.007715281553787714, "rouge1_fmeasure": 0.09138645256779954, "rouge1_fmeasure_stderr": 0.001016337895874407, "rouge1_precision": 0.082701062500984, "rouge1_precision_stderr": 0.0011114690924738965, "rouge1_recall": 0.12060845181501477, "rouge1_recall_stderr": 0.001294419304833145, "rouge2_fmeasure": 0.004231279339262943, "rouge2_fmeasure_stderr": 0.00021788523328651946, "rouge2_precision": 0.004221824477192578, "rouge2_precision_stderr": 0.00022695165818508478, "rouge2_recall": 0.005013826064972669, "rouge2_recall_stderr": 0.0002886749356995342, "rougeL_fmeasure": 0.08450530803702978, "rougeL_fmeasure_stderr": 0.000901554006313133, "rougeL_precision": 0.07584986609826198, "rougeL_precision_stderr": 0.0009625240871993999, "rougeL_recall": 0.11285337234897329, "rougeL_recall_stderr": 0.0012181187217990192, "rougeLsum_fmeasure": 0.08182685699093503, "rougeLsum_fmeasure_stderr": 0.0008824530614132399, "rougeLsum_precision": 0.07385235192113547, "rougeLsum_precision_stderr": 0.0009724143584799478, "rougeLsum_recall": 0.10890947711749421, "rougeLsum_recall_stderr": 0.001166359192980158}}, "1": {"tldr_en": {"bleu": 0.6846935681911303, "bleu_stderr": 0.048520060976790166, "rouge1_fmeasure": 0.1243815766776328, "rouge1_fmeasure_stderr": 0.0015555535464807611, "rouge1_precision": 0.10817533987287839, "rouge1_precision_stderr": 0.001557158831062884, "rouge1_recall": 0.17611373045605921, "rouge1_recall_stderr": 0.0022577581661268067, "rouge2_fmeasure": 0.012970684860542986, "rouge2_fmeasure_stderr": 0.000559978352567754, "rouge2_precision": 0.011131441628753095, "rouge2_precision_stderr": 0.0004968420013755639, "rouge2_recall": 0.01944327988697502, "rouge2_recall_stderr": 0.0009397177797597168, "rougeL_fmeasure": 0.10222393170091604, "rougeL_fmeasure_stderr": 0.001072599197742085, "rougeL_precision": 0.08784703908570092, "rougeL_precision_stderr": 0.0010678467138718758, "rougeL_recall": 0.14780695365208887, "rougeL_recall_stderr": 0.0017330218043806387, "rougeLsum_fmeasure": 0.11629483720068202, "rougeLsum_fmeasure_stderr": 0.0014424904565344696, "rougeLsum_precision": 0.10103160064031973, "rougeLsum_precision_stderr": 0.0014463815800196036, "rougeLsum_recall": 0.16516729505216035, "rougeLsum_recall_stderr": 0.0021165463983442577}}, "2": {"tldr_en": {"bleu": 0.924456967620158, "bleu_stderr": 0.03559680289475058, "rouge1_fmeasure": 0.13555617621093394, "rouge1_fmeasure_stderr": 0.0016918864606359741, "rouge1_precision": 0.11722745375161758, "rouge1_precision_stderr": 0.0016710911856635707, "rouge1_recall": 0.19369245188326673, "rouge1_recall_stderr": 0.0024549618061264467, "rouge2_fmeasure": 0.018558312186827172, "rouge2_fmeasure_stderr": 0.0006667508176160621, "rouge2_precision": 0.015714560769775038, "rouge2_precision_stderr": 0.0005800745400016176, "rouge2_recall": 0.028401953794142447, "rouge2_recall_stderr": 0.0011788887815516438, "rougeL_fmeasure": 0.11035639324221315, "rougeL_fmeasure_stderr": 0.00116330079987278, "rougeL_precision": 0.09440630375692156, "rougeL_precision_stderr": 0.0011423074011562776, "rougeL_recall": 0.16112301920239752, "rougeL_recall_stderr": 0.0019113881872497018, "rougeLsum_fmeasure": 0.12597857991599187, "rougeLsum_fmeasure_stderr": 0.0015636832895239018, "rougeLsum_precision": 0.10875530149178837, "rougeLsum_precision_stderr": 0.001540487913238783, "rougeLsum_recall": 0.18075066689057118, "rougeLsum_recall_stderr": 0.002307538960280241}}, "3": {"tldr_en": {"bleu": 1.3233423451404336, "bleu_stderr": 0.05997841498906398, "rouge1_fmeasure": 0.13144771127478205, "rouge1_fmeasure_stderr": 0.0019737365577617576, "rouge1_precision": 0.11737973030479486, "rouge1_precision_stderr": 0.0020068624954179474, "rouge1_recall": 0.190625289714194, "rouge1_recall_stderr": 0.0029906981571119883, "rouge2_fmeasure": 0.022809767017555965, "rouge2_fmeasure_stderr": 0.0007411221039709219, "rouge2_precision": 0.019855590629909, "rouge2_precision_stderr": 0.0006805588907016732, "rouge2_recall": 0.035136977735741294, "rouge2_recall_stderr": 0.0012979131621517655, "rougeL_fmeasure": 0.10364321400933231, "rougeL_fmeasure_stderr": 0.0014077752246067334, "rougeL_precision": 0.09211895227457069, "rougeL_precision_stderr": 0.001463335123058735, "rougeL_recall": 0.1532140325253042, "rougeL_recall_stderr": 0.0023289864471671166, "rougeLsum_fmeasure": 0.12201579783859283, "rougeLsum_fmeasure_stderr": 0.0018234148242418614, "rougeLsum_precision": 0.10903674502952361, "rougeLsum_precision_stderr": 0.0018618317785407703, "rougeLsum_recall": 0.17719145160862762, "rougeLsum_recall_stderr": 0.002790477962341549}}, "4": {"tldr_en": {"bleu": 0.3613016066707259, "bleu_stderr": 0.036915701681708164, "rouge1_fmeasure": 0.047822320448517586, "rouge1_fmeasure_stderr": 0.0017110762026540103, "rouge1_precision": 0.04521455263428701, "rouge1_precision_stderr": 0.001817051799470601, "rouge1_recall": 0.07167001920005753, "rouge1_recall_stderr": 0.0026297817119641954, "rouge2_fmeasure": 0.009630515215905031, "rouge2_fmeasure_stderr": 0.0005703572936436381, "rouge2_precision": 0.008537546793712486, "rouge2_precision_stderr": 0.0005451212697125958, "rouge2_recall": 0.015535161953539139, "rouge2_recall_stderr": 0.0010082854476503452, "rougeL_fmeasure": 0.037591797438144656, "rougeL_fmeasure_stderr": 0.0012882375080276282, "rougeL_precision": 0.03581845161408539, "rougeL_precision_stderr": 0.001454348764904071, "rougeL_recall": 0.05736959242350815, "rougeL_recall_stderr": 0.0020694778242817827, "rougeLsum_fmeasure": 0.04440192076877557, "rougeLsum_fmeasure_stderr": 0.0015922060897516483, "rougeLsum_precision": 0.04226563254674496, "rougeLsum_precision_stderr": 0.001724425679642684, "rougeLsum_recall": 0.0664662102163325, "rougeLsum_recall_stderr": 0.0024427933119544}}, "5": {"tldr_en": {"bleu": 3.525168786175522e-07, "bleu_stderr": 5.612162101464965e-07, "rouge1_fmeasure": 0.00773346326426295, "rouge1_fmeasure_stderr": 0.0007772828239978309, "rouge1_precision": 0.007307470265602752, "rouge1_precision_stderr": 0.0007897900183615145, "rouge1_recall": 0.011510813177556005, "rouge1_recall_stderr": 0.0011552914746690388, "rouge2_fmeasure": 0.0015852585648270666, "rouge2_fmeasure_stderr": 0.00022806743110909383, "rouge2_precision": 0.001429085231861359, "rouge2_precision_stderr": 0.00021883906441152553, "rouge2_recall": 0.002570423991576332, "rouge2_recall_stderr": 0.00046790313160452587, "rougeL_fmeasure": 0.005728100957946406, "rougeL_fmeasure_stderr": 0.0005516850094907344, "rougeL_precision": 0.005352988837771441, "rougeL_precision_stderr": 0.0005537181118288242, "rougeL_recall": 0.00878014362052647, "rougeL_recall_stderr": 0.0008837201326346826, "rougeLsum_fmeasure": 0.007108428092699879, "rougeLsum_fmeasure_stderr": 0.0007115175993630998, "rougeLsum_precision": 0.006777901778057594, "rougeLsum_precision_stderr": 0.0007374290911612345, "rougeLsum_recall": 0.010579350524900687, "rougeLsum_recall_stderr": 0.0010634321336190457}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.043151087658762945, "bleu_stderr": 0.0032342340375698313, "rouge1_fmeasure": 0.08500629167730897, "rouge1_fmeasure_stderr": 0.0006989135591713463, "rouge1_precision": 0.06495149327905961, "rouge1_precision_stderr": 0.0006154921660754954, "rouge1_recall": 0.13370167404751163, "rouge1_recall_stderr": 0.0010189486890049794, "rouge2_fmeasure": 0.0046873222992475675, "rouge2_fmeasure_stderr": 0.0002022412956316276, "rouge2_precision": 0.0035303832289245198, "rouge2_precision_stderr": 0.0001527456230237433, "rouge2_recall": 0.007319549034551619, "rouge2_recall_stderr": 0.00032488894186680385, "rougeL_fmeasure": 0.08337201239477712, "rougeL_fmeasure_stderr": 0.000651472451774746, "rougeL_precision": 0.06359934650707177, "rougeL_precision_stderr": 0.0005723430187186213, "rougeL_recall": 0.13157140403081988, "rougeL_recall_stderr": 0.0009812226045655013, "rougeLsum_fmeasure": 0.07193972974334535, "rougeLsum_fmeasure_stderr": 0.0005686183231544299, "rougeLsum_precision": 0.05504912687523559, "rougeLsum_precision_stderr": 0.0005196674711591116, "rougeLsum_recall": 0.11355464831908403, "rougeLsum_recall_stderr": 0.0008466893831039181}}, "1": {"generate_text_restaurant": {"bleu": 5.257466769424374, "bleu_stderr": 0.07357547724601604, "rouge1_fmeasure": 0.28148883433290234, "rouge1_fmeasure_stderr": 0.001923209524922163, "rouge1_precision": 0.23786938588769446, "rouge1_precision_stderr": 0.002104681875527956, "rouge1_recall": 0.4121530298560502, "rouge1_recall_stderr": 0.0030089320928001376, "rouge2_fmeasure": 0.10965942344261605, "rouge2_fmeasure_stderr": 0.001266910542618815, "rouge2_precision": 0.09021145181730728, "rouge2_precision_stderr": 0.0012485251931366231, "rouge2_recall": 0.16603227142774246, "rouge2_recall_stderr": 0.002026618225842661, "rougeL_fmeasure": 0.23105487418644294, "rougeL_fmeasure_stderr": 0.0014408012821073104, "rougeL_precision": 0.193582677648821, "rougeL_precision_stderr": 0.001580931677078503, "rougeL_recall": 0.34417212574068223, "rougeL_recall_stderr": 0.0025977918681195948, "rougeLsum_fmeasure": 0.22905909984905182, "rougeLsum_fmeasure_stderr": 0.0017314569916895598, "rougeLsum_precision": 0.19491877857913123, "rougeLsum_precision_stderr": 0.001917333394119044, "rougeLsum_recall": 0.3349539368345205, "rougeLsum_recall_stderr": 0.0027000371649433153}}, "2": {"generate_text_restaurant": {"bleu": 6.5539390861720035, "bleu_stderr": 0.11033075795685604, "rouge1_fmeasure": 0.32505406392907477, "rouge1_fmeasure_stderr": 0.0019673064297673175, "rouge1_precision": 0.2903404594422581, "rouge1_precision_stderr": 0.0024584696959761548, "rouge1_recall": 0.4333135279478913, "rouge1_recall_stderr": 0.002735582379776765, "rouge2_fmeasure": 0.13810992950827863, "rouge2_fmeasure_stderr": 0.001444581219830018, "rouge2_precision": 0.12271979460763294, "rouge2_precision_stderr": 0.0015962080243060197, "rouge2_recall": 0.1876592718803383, "rouge2_recall_stderr": 0.002021534025159006, "rougeL_fmeasure": 0.2534760790754178, "rougeL_fmeasure_stderr": 0.0014777484527935105, "rougeL_precision": 0.22365724489731642, "rougeL_precision_stderr": 0.0018056307126048264, "rougeL_recall": 0.345401861303691, "rougeL_recall_stderr": 0.0024446936487181117, "rougeLsum_fmeasure": 0.2673001651581832, "rougeLsum_fmeasure_stderr": 0.0018598623220801944, "rougeLsum_precision": 0.2395654073443738, "rougeLsum_precision_stderr": 0.00223173839964408, "rougeLsum_recall": 0.35566140454139644, "rougeLsum_recall_stderr": 0.0025565410886410314}}, "3": {"generate_text_restaurant": {"bleu": 7.802031385100445, "bleu_stderr": 0.12520230313336564, "rouge1_fmeasure": 0.3574815420785405, "rouge1_fmeasure_stderr": 0.0020170776348172677, "rouge1_precision": 0.3338557482843276, "rouge1_precision_stderr": 0.002551971432391476, "rouge1_recall": 0.438621126600984, "rouge1_recall_stderr": 0.0026752980677359345, "rouge2_fmeasure": 0.15830106057751542, "rouge2_fmeasure_stderr": 0.001543718964396758, "rouge2_precision": 0.14777645169229991, "rouge2_precision_stderr": 0.001697278469314969, "rouge2_recall": 0.1968122332214139, "rouge2_recall_stderr": 0.0020328947895924005, "rougeL_fmeasure": 0.26474191187216406, "rougeL_fmeasure_stderr": 0.0015211355756810816, "rougeL_precision": 0.24519327081461934, "rougeL_precision_stderr": 0.0018824234987225013, "rougeL_recall": 0.33120052143267326, "rougeL_recall_stderr": 0.002370647416558713, "rougeLsum_fmeasure": 0.29524890844717644, "rougeLsum_fmeasure_stderr": 0.0019147519696928641, "rougeLsum_precision": 0.2764593808821877, "rougeLsum_precision_stderr": 0.002325889733406939, "rougeLsum_recall": 0.36160015573781135, "rougeLsum_recall_stderr": 0.00249846394269229}}, "4": {"generate_text_restaurant": {"bleu": 8.382191502210562, "bleu_stderr": 0.14919541969359718, "rouge1_fmeasure": 0.37167375204035885, "rouge1_fmeasure_stderr": 0.001958486588378226, "rouge1_precision": 0.35487550391649403, "rouge1_precision_stderr": 0.0025344302739034657, "rouge1_recall": 0.4361438966782426, "rouge1_recall_stderr": 0.002544026410687206, "rouge2_fmeasure": 0.1643064184413658, "rouge2_fmeasure_stderr": 0.0015388346414606316, "rouge2_precision": 0.15717469519815827, "rouge2_precision_stderr": 0.0017315413364928619, "rouge2_recall": 0.19553331645690575, "rouge2_recall_stderr": 0.001973839383704276, "rougeL_fmeasure": 0.269902657132463, "rougeL_fmeasure_stderr": 0.001559683698964203, "rougeL_precision": 0.2558302815927228, "rougeL_precision_stderr": 0.0019085151038414955, "rougeL_recall": 0.3217813559039779, "rougeL_recall_stderr": 0.002287943298156401, "rougeLsum_fmeasure": 0.3079430786135033, "rougeLsum_fmeasure_stderr": 0.0018787126708384577, "rougeLsum_precision": 0.29398198190868147, "rougeLsum_precision_stderr": 0.0022912359575795035, "rougeLsum_recall": 0.3619119446078494, "rougeLsum_recall_stderr": 0.0024413151060046338}}, "5": {"generate_text_restaurant": {"bleu": 8.434096593422941, "bleu_stderr": 0.12374952402155545, "rouge1_fmeasure": 0.3784865425574256, "rouge1_fmeasure_stderr": 0.0019280406579030697, "rouge1_precision": 0.3663947022200451, "rouge1_precision_stderr": 0.0025940276035361334, "rouge1_recall": 0.4374444751473449, "rouge1_recall_stderr": 0.002477034708943807, "rouge2_fmeasure": 0.1672452517241617, "rouge2_fmeasure_stderr": 0.0015293774465050592, "rouge2_precision": 0.1621778484306982, "rouge2_precision_stderr": 0.0017550698361701628, "rouge2_recall": 0.19557321714564072, "rouge2_recall_stderr": 0.001918438736692879, "rougeL_fmeasure": 0.27416374665856746, "rougeL_fmeasure_stderr": 0.0015606439952248043, "rougeL_precision": 0.2631916594457156, "rougeL_precision_stderr": 0.0019344484995491348, "rougeL_recall": 0.32194602376614945, "rougeL_recall_stderr": 0.0022606742323408495, "rougeLsum_fmeasure": 0.3146193040515442, "rougeLsum_fmeasure_stderr": 0.0018454927016279012, "rougeLsum_precision": 0.30450332346607173, "rougeLsum_precision_stderr": 0.002324281011297584, "rougeLsum_recall": 0.36375181614652935, "rougeLsum_recall_stderr": 0.002343908164411574}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.2362860989119941, "bleu_stderr": 0.06515733012952077, "rouge1_fmeasure": 0.11289013583650749, "rouge1_fmeasure_stderr": 0.0015241960402614591, "rouge1_precision": 0.08342489348341924, "rouge1_precision_stderr": 0.0012842607244636889, "rouge1_recall": 0.18313462458030272, "rouge1_recall_stderr": 0.0022833897865371204, "rouge2_fmeasure": 0.00907913529407834, "rouge2_fmeasure_stderr": 0.0005360706812311225, "rouge2_precision": 0.006807640108939664, "rouge2_precision_stderr": 0.0004546616697105076, "rouge2_recall": 0.014646684932925907, "rouge2_recall_stderr": 0.0008582893632378247, "rougeL_fmeasure": 0.09604887262918436, "rougeL_fmeasure_stderr": 0.0012450372296769945, "rougeL_precision": 0.07075720554200345, "rougeL_precision_stderr": 0.0010438807547277764, "rougeL_recall": 0.1570230345629149, "rougeL_recall_stderr": 0.001966731475785525, "rougeLsum_fmeasure": 0.09369805182267531, "rougeLsum_fmeasure_stderr": 0.0012217813043505867, "rougeLsum_precision": 0.06903402748086229, "rougeLsum_precision_stderr": 0.0010262159629070258, "rougeLsum_recall": 0.15315001526257727, "rougeLsum_recall_stderr": 0.00192242995222989}}, "1": {"article_DOC_summary": {"bleu": 0.37834186143792814, "bleu_stderr": 0.07579632209521951, "rouge1_fmeasure": 0.09612718872859236, "rouge1_fmeasure_stderr": 0.0016940816359247253, "rouge1_precision": 0.0685738400230712, "rouge1_precision_stderr": 0.0012614794489907162, "rouge1_recall": 0.16764066671392627, "rouge1_recall_stderr": 0.0028556162454937625, "rouge2_fmeasure": 0.009694349099296177, "rouge2_fmeasure_stderr": 0.0007117075020582719, "rouge2_precision": 0.00687822694283814, "rouge2_precision_stderr": 0.0005061346255521965, "rouge2_recall": 0.017217988643461746, "rouge2_recall_stderr": 0.001286522520281308, "rougeL_fmeasure": 0.08667789904325363, "rougeL_fmeasure_stderr": 0.0014098828990377955, "rougeL_precision": 0.06179089290304659, "rougeL_precision_stderr": 0.001050543101974293, "rougeL_recall": 0.15154479594716758, "rougeL_recall_stderr": 0.0024146601749356305, "rougeLsum_fmeasure": 0.08238177966767227, "rougeLsum_fmeasure_stderr": 0.0013689020999623559, "rougeLsum_precision": 0.058639097301684553, "rougeLsum_precision_stderr": 0.0010116604269926539, "rougeLsum_recall": 0.14457668621942818, "rougeLsum_recall_stderr": 0.0023919173520257917}}, "2": {"article_DOC_summary": {"bleu": 0.4849143382987017, "bleu_stderr": 0.04905432423972044, "rouge1_fmeasure": 0.10437600774672044, "rouge1_fmeasure_stderr": 0.00207952100569553, "rouge1_precision": 0.07431434923410046, "rouge1_precision_stderr": 0.0015283612791276602, "rouge1_recall": 0.1827918011387138, "rouge1_recall_stderr": 0.003586036642211407, "rouge2_fmeasure": 0.013095799671134395, "rouge2_fmeasure_stderr": 0.0008489058427507967, "rouge2_precision": 0.009246404787084099, "rouge2_precision_stderr": 0.0006013153673346344, "rouge2_recall": 0.02349062031177244, "rouge2_recall_stderr": 0.001548636822631295, "rougeL_fmeasure": 0.09226827959916255, "rougeL_fmeasure_stderr": 0.0016450494129720853, "rougeL_precision": 0.06564873824136315, "rougeL_precision_stderr": 0.0012078804697379189, "rougeL_recall": 0.16191451646036112, "rougeL_recall_stderr": 0.002884321850290769, "rougeLsum_fmeasure": 0.08811439184894636, "rougeLsum_fmeasure_stderr": 0.001657196039152884, "rougeLsum_precision": 0.06260025314838306, "rougeLsum_precision_stderr": 0.0012121969148993826, "rougeLsum_recall": 0.15522785504407458, "rougeLsum_recall_stderr": 0.002921032689214908}}, "3": {"article_DOC_summary": {"bleu": 0.5746056304031278, "bleu_stderr": 0.03416800265114781, "rouge1_fmeasure": 0.10584203948827893, "rouge1_fmeasure_stderr": 0.0022277495525711575, "rouge1_precision": 0.07801541338302878, "rouge1_precision_stderr": 0.0018173640538940942, "rouge1_recall": 0.18154402515522824, "rouge1_recall_stderr": 0.003824785983248113, "rouge2_fmeasure": 0.014219210333358922, "rouge2_fmeasure_stderr": 0.0009311822269523914, "rouge2_precision": 0.01034600818070244, "rouge2_precision_stderr": 0.0007057473078572547, "rouge2_recall": 0.024986150502265235, "rouge2_recall_stderr": 0.001692210317983424, "rougeL_fmeasure": 0.09228122264940566, "rougeL_fmeasure_stderr": 0.001790178384835284, "rougeL_precision": 0.0678536492661622, "rougeL_precision_stderr": 0.0014739423841839488, "rougeL_recall": 0.1585838156145662, "rougeL_recall_stderr": 0.0030943555602493257, "rougeLsum_fmeasure": 0.08945125770061474, "rougeLsum_fmeasure_stderr": 0.0018164527031048951, "rougeLsum_precision": 0.06570229340369058, "rougeLsum_precision_stderr": 0.0014813914622807897, "rougeLsum_recall": 0.1543154445584897, "rougeLsum_recall_stderr": 0.0031794372645773186}}, "4": {"article_DOC_summary": {"bleu": 0.4651237596229011, "bleu_stderr": 0.11156128095981639, "rouge1_fmeasure": 0.03399511479025987, "rouge1_fmeasure_stderr": 0.0020729464765277573, "rouge1_precision": 0.03064279607658244, "rouge1_precision_stderr": 0.0022965807355317283, "rouge1_recall": 0.05195082603025642, "rouge1_recall_stderr": 0.003220539005393162, "rouge2_fmeasure": 0.005260596389372223, "rouge2_fmeasure_stderr": 0.0006214146366789691, "rouge2_precision": 0.004284009012383848, "rouge2_precision_stderr": 0.0005485208167059648, "rouge2_recall": 0.008382344530163326, "rouge2_recall_stderr": 0.001000201966734194, "rougeL_fmeasure": 0.028356576842958853, "rougeL_fmeasure_stderr": 0.0016783975883159549, "rougeL_precision": 0.025526009580683906, "rougeL_precision_stderr": 0.0018894407281977873, "rougeL_recall": 0.04350947253111104, "rougeL_recall_stderr": 0.002619339324653331, "rougeLsum_fmeasure": 0.02809770605845126, "rougeLsum_fmeasure_stderr": 0.0016857742426147392, "rougeLsum_precision": 0.025412397431441656, "rougeLsum_precision_stderr": 0.0019050756306301015, "rougeLsum_recall": 0.043137551559700685, "rougeLsum_recall_stderr": 0.0026330951282367504}}, "5": {"article_DOC_summary": {"bleu": 1.0502876017100983e-40, "bleu_stderr": 9.258629322298703e-36, "rouge1_fmeasure": 0.001878489668702617, "rouge1_fmeasure_stderr": 0.0005292148091180053, "rouge1_precision": 0.0021504887939511347, "rouge1_precision_stderr": 0.0006247191105487891, "rouge1_recall": 0.001777814575528098, "rouge1_recall_stderr": 0.0005039303650046605, "rouge2_fmeasure": 4.1835752834372257e-05, "rouge2_fmeasure_stderr": 4.183575283437121e-05, "rouge2_precision": 5.717552887364208e-05, "rouge2_precision_stderr": 5.7175528873642526e-05, "rouge2_recall": 3.298588204248582e-05, "rouge2_recall_stderr": 3.298588204248464e-05, "rougeL_fmeasure": 0.0016730266584955929, "rougeL_fmeasure_stderr": 0.00046229330330986443, "rougeL_precision": 0.0018915873488676697, "rougeL_precision_stderr": 0.0005354926007169597, "rougeL_recall": 0.0016022233245543466, "rougeL_recall_stderr": 0.00045106425429012806, "rougeLsum_fmeasure": 0.0016122575588416806, "rougeLsum_fmeasure_stderr": 0.00045030733547815735, "rougeLsum_precision": 0.0018136207185854307, "rougeLsum_precision_stderr": 0.0005184093493039095, "rougeLsum_recall": 0.0015526690795628231, "rougeLsum_recall_stderr": 0.0004429788793808031}}}}
\ No newline at end of file
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0.csv b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0.csv
new file mode 100644
index 0000000000000000000000000000000000000000..fd47de5c918737b5ca697a9226f68e1e5d20cb0c
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.334,0.014922019523732967,0
+anli_r2,acc,0.325,0.014818724459095527,0
+anli_r3,acc,0.3441666666666667,0.013720551062295756,0
+arc_challenge,acc,0.26023890784982934,0.012821930225112568,0
+arc_challenge,acc_norm,0.2790102389078498,0.01310678488360133,0
+arc_easy,acc,0.5660774410774411,0.010169795770462111,0
+arc_easy,acc_norm,0.5084175084175084,0.010258329515226459,0
+boolq,acc,0.591131498470948,0.008598573693259106,1
+cb,acc,0.4107142857142857,0.0663363415035954,1
+cb,f1,0.1940928270042194,,1
+copa,acc,0.79,0.040936018074033256,0
+hellaswag,acc,0.46415056761601275,0.0049769393332400776,0
+hellaswag,acc_norm,0.6052579167496515,0.0048779626449918555,0
+piqa,acc,0.7404787812840044,0.01022793988817392,0
+piqa,acc_norm,0.7431991294885746,0.01019286480227804,0
+rte,acc,0.5270758122743683,0.0300523034631437,0
+sciq,acc,0.829,0.011912216456264607,0
+sciq,acc_norm,0.751,0.013681600278702301,0
+storycloze_2016,acc,0.7151256012827365,0.010437513986611718,0
+winogrande,acc,0.5824782951854776,0.013859978264440251,0
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0_lm-eval_global_step80108_2023-02-25-09-56-03_0shots_backup.json b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0_lm-eval_global_step80108_2023-02-25-09-56-03_0shots_backup.json
deleted file mode 100644
index e79490b678ff52ad5bb5d7467f513c8a24b345d6..0000000000000000000000000000000000000000
--- a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_0_lm-eval_global_step80108_2023-02-25-09-56-03_0shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.334,
-      "acc_stderr": 0.014922019523732967
-    },
-    "anli_r2": {
-      "acc": 0.325,
-      "acc_stderr": 0.014818724459095527
-    },
-    "anli_r3": {
-      "acc": 0.3441666666666667,
-      "acc_stderr": 0.013720551062295756
-    },
-    "cb": {
-      "acc": 0.4107142857142857,
-      "acc_stderr": 0.0663363415035954,
-      "f1": 0.1940928270042194
-    },
-    "copa": {
-      "acc": 0.79,
-      "acc_stderr": 0.040936018074033256
-    },
-    "hellaswag": {
-      "acc": 0.46415056761601275,
-      "acc_stderr": 0.0049769393332400776,
-      "acc_norm": 0.6052579167496515,
-      "acc_norm_stderr": 0.0048779626449918555
-    },
-    "rte": {
-      "acc": 0.5270758122743683,
-      "acc_stderr": 0.0300523034631437
-    },
-    "winogrande": {
-      "acc": 0.5824782951854776,
-      "acc_stderr": 0.013859978264440251
-    },
-    "storycloze_2016": {
-      "acc": 0.7151256012827365,
-      "acc_stderr": 0.010437513986611718
-    },
-    "boolq": {
-      "acc": 0.591131498470948,
-      "acc_stderr": 0.008598573693259106
-    },
-    "arc_easy": {
-      "acc": 0.5660774410774411,
-      "acc_stderr": 0.010169795770462111,
-      "acc_norm": 0.5084175084175084,
-      "acc_norm_stderr": 0.010258329515226459
-    },
-    "arc_challenge": {
-      "acc": 0.26023890784982934,
-      "acc_stderr": 0.012821930225112568,
-      "acc_norm": 0.2790102389078498,
-      "acc_norm_stderr": 0.01310678488360133
-    },
-    "sciq": {
-      "acc": 0.829,
-      "acc_stderr": 0.011912216456264607,
-      "acc_norm": 0.751,
-      "acc_norm_stderr": 0.013681600278702301
-    },
-    "piqa": {
-      "acc": 0.7404787812840044,
-      "acc_stderr": 0.01022793988817392,
-      "acc_norm": 0.7431991294885746,
-      "acc_norm_stderr": 0.01019286480227804
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1.csv b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e9af64a197eb754feebc028d5f4795335fc906c1
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.339,0.014976758771620342,0
+anli_r2,acc,0.323,0.014794927843348644,0
+anli_r3,acc,0.3441666666666667,0.013720551062295756,0
+arc_challenge,acc,0.2721843003412969,0.013006600406423706,0
+arc_challenge,acc_norm,0.3037542662116041,0.013438909184778764,0
+arc_easy,acc,0.6056397306397306,0.010028176038393004,0
+arc_easy,acc_norm,0.5606060606060606,0.010184134315437663,0
+boolq,acc,0.5773700305810398,0.008639722698719023,1
+cb,acc,0.5,0.06741998624632421,1
+cb,f1,0.3261261261261261,,1
+copa,acc,0.81,0.03942772444036623,0
+hellaswag,acc,0.4643497311292571,0.004977081808179424,0
+hellaswag,acc_norm,0.6074487153953396,0.004873203269366301,0
+piqa,acc,0.7535364526659413,0.010054810789671824,0
+piqa,acc_norm,0.7595212187159956,0.009971345364651068,0
+rte,acc,0.5270758122743683,0.030052303463143706,0
+sciq,acc,0.847,0.01138950045966553,0
+sciq,acc_norm,0.792,0.012841374572096928,0
+storycloze_2016,acc,0.7129877071084981,0.010460934115933261,0
+winogrande,acc,0.5777426992896606,0.013881582030658549,0
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1_lm-eval_global_step80108_2023-02-25-09-56-03_1shots_backup.json b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1_lm-eval_global_step80108_2023-02-25-09-56-03_1shots_backup.json
deleted file mode 100644
index 6bd0932431dc7c298c53127b475e844b3636a0dd..0000000000000000000000000000000000000000
--- a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_1_lm-eval_global_step80108_2023-02-25-09-56-03_1shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.339,
-      "acc_stderr": 0.014976758771620342
-    },
-    "anli_r2": {
-      "acc": 0.323,
-      "acc_stderr": 0.014794927843348644
-    },
-    "anli_r3": {
-      "acc": 0.3441666666666667,
-      "acc_stderr": 0.013720551062295756
-    },
-    "cb": {
-      "acc": 0.5,
-      "acc_stderr": 0.06741998624632421,
-      "f1": 0.3261261261261261
-    },
-    "copa": {
-      "acc": 0.81,
-      "acc_stderr": 0.03942772444036623
-    },
-    "hellaswag": {
-      "acc": 0.4643497311292571,
-      "acc_stderr": 0.004977081808179424,
-      "acc_norm": 0.6074487153953396,
-      "acc_norm_stderr": 0.004873203269366301
-    },
-    "rte": {
-      "acc": 0.5270758122743683,
-      "acc_stderr": 0.030052303463143706
-    },
-    "winogrande": {
-      "acc": 0.5777426992896606,
-      "acc_stderr": 0.013881582030658549
-    },
-    "storycloze_2016": {
-      "acc": 0.7129877071084981,
-      "acc_stderr": 0.010460934115933261
-    },
-    "boolq": {
-      "acc": 0.5773700305810398,
-      "acc_stderr": 0.008639722698719023
-    },
-    "arc_easy": {
-      "acc": 0.6056397306397306,
-      "acc_stderr": 0.010028176038393004,
-      "acc_norm": 0.5606060606060606,
-      "acc_norm_stderr": 0.010184134315437663
-    },
-    "arc_challenge": {
-      "acc": 0.2721843003412969,
-      "acc_stderr": 0.013006600406423706,
-      "acc_norm": 0.3037542662116041,
-      "acc_norm_stderr": 0.013438909184778764
-    },
-    "sciq": {
-      "acc": 0.847,
-      "acc_stderr": 0.01138950045966553,
-      "acc_norm": 0.792,
-      "acc_norm_stderr": 0.012841374572096928
-    },
-    "piqa": {
-      "acc": 0.7535364526659413,
-      "acc_stderr": 0.010054810789671824,
-      "acc_norm": 0.7595212187159956,
-      "acc_norm_stderr": 0.009971345364651068
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2.csv b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2.csv
new file mode 100644
index 0000000000000000000000000000000000000000..8c980af60cd99be463bb13ae0caee0e2f84371d7
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.326,0.014830507204541037,0
+anli_r2,acc,0.337,0.014955087918653607,0
+anli_r3,acc,0.33416666666666667,0.013622434813136774,0
+arc_challenge,acc,0.28071672354948807,0.013131238126975576,0
+arc_challenge,acc_norm,0.3037542662116041,0.013438909184778766,0
+arc_easy,acc,0.5993265993265994,0.010055304474255573,0
+arc_easy,acc_norm,0.5694444444444444,0.010160345396860082,0
+boolq,acc,0.5752293577981651,0.008645503833361106,1
+cb,acc,0.42857142857142855,0.06672848092813058,1
+cb,f1,0.26622479977906655,,1
+copa,acc,0.81,0.039427724440366234,0
+hellaswag,acc,0.4629555865365465,0.004976067726432562,0
+hellaswag,acc_norm,0.609838677554272,0.004867893927258165,0
+piqa,acc,0.7437431991294886,0.01018578783156506,0
+piqa,acc_norm,0.7524483133841132,0.010069703966857116,0
+rte,acc,0.5270758122743683,0.0300523034631437,0
+sciq,acc,0.844,0.011480235006122363,0
+sciq,acc_norm,0.794,0.012795613612786548,0
+storycloze_2016,acc,0.7145911277391769,0.010443395884062115,0
+winogrande,acc,0.5824782951854776,0.013859978264440246,0
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2_lm-eval_global_step80108_2023-02-25-09-56-03_2shots_backup.json b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2_lm-eval_global_step80108_2023-02-25-09-56-03_2shots_backup.json
deleted file mode 100644
index d34be2c98d249407f9ed9dc153e3169856dcc8bb..0000000000000000000000000000000000000000
--- a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_2_lm-eval_global_step80108_2023-02-25-09-56-03_2shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.326,
-      "acc_stderr": 0.014830507204541037
-    },
-    "anli_r2": {
-      "acc": 0.337,
-      "acc_stderr": 0.014955087918653607
-    },
-    "anli_r3": {
-      "acc": 0.33416666666666667,
-      "acc_stderr": 0.013622434813136774
-    },
-    "cb": {
-      "acc": 0.42857142857142855,
-      "acc_stderr": 0.06672848092813058,
-      "f1": 0.26622479977906655
-    },
-    "copa": {
-      "acc": 0.81,
-      "acc_stderr": 0.039427724440366234
-    },
-    "hellaswag": {
-      "acc": 0.4629555865365465,
-      "acc_stderr": 0.004976067726432562,
-      "acc_norm": 0.609838677554272,
-      "acc_norm_stderr": 0.004867893927258165
-    },
-    "rte": {
-      "acc": 0.5270758122743683,
-      "acc_stderr": 0.0300523034631437
-    },
-    "winogrande": {
-      "acc": 0.5824782951854776,
-      "acc_stderr": 0.013859978264440246
-    },
-    "storycloze_2016": {
-      "acc": 0.7145911277391769,
-      "acc_stderr": 0.010443395884062115
-    },
-    "boolq": {
-      "acc": 0.5752293577981651,
-      "acc_stderr": 0.008645503833361106
-    },
-    "arc_easy": {
-      "acc": 0.5993265993265994,
-      "acc_stderr": 0.010055304474255573,
-      "acc_norm": 0.5694444444444444,
-      "acc_norm_stderr": 0.010160345396860082
-    },
-    "arc_challenge": {
-      "acc": 0.28071672354948807,
-      "acc_stderr": 0.013131238126975576,
-      "acc_norm": 0.3037542662116041,
-      "acc_norm_stderr": 0.013438909184778766
-    },
-    "sciq": {
-      "acc": 0.844,
-      "acc_stderr": 0.011480235006122363,
-      "acc_norm": 0.794,
-      "acc_norm_stderr": 0.012795613612786548
-    },
-    "piqa": {
-      "acc": 0.7437431991294886,
-      "acc_stderr": 0.01018578783156506,
-      "acc_norm": 0.7524483133841132,
-      "acc_norm_stderr": 0.010069703966857116
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3.csv b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3.csv
new file mode 100644
index 0000000000000000000000000000000000000000..544f55b85151118fb6c35e4f282498bd3fdd2512
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.316,0.014709193056057127,0
+anli_r2,acc,0.337,0.014955087918653609,0
+anli_r3,acc,0.355,0.0138192490040473,0
+arc_challenge,acc,0.27559726962457337,0.013057169655761841,0
+arc_challenge,acc_norm,0.30204778156996587,0.013417519144716413,0
+arc_easy,acc,0.5896464646464646,0.010093531255765457,0
+arc_easy,acc_norm,0.571969696969697,0.01015294331642626,0
+boolq,acc,0.5831804281345566,0.008623192108843677,1
+cb,acc,0.44642857142857145,0.06703189227942398,1
+cb,f1,0.25805555555555554,,1
+copa,acc,0.79,0.040936018074033256,0
+hellaswag,acc,0.4627564230233021,0.004975919665116542,0
+hellaswag,acc_norm,0.6117307309300936,0.004863603638367434,0
+piqa,acc,0.7480957562568009,0.010128421335088683,0
+piqa,acc_norm,0.7595212187159956,0.009971345364651066,0
+rte,acc,0.5270758122743683,0.0300523034631437,0
+sciq,acc,0.834,0.011772110370812184,0
+sciq,acc_norm,0.793,0.012818553557843986,0
+storycloze_2016,acc,0.711918760021379,0.010472537019822576,0
+winogrande,acc,0.5824782951854776,0.013859978264440251,0
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3_lm-eval_global_step80108_2023-02-25-09-54-24_3shots_backup.json b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3_lm-eval_global_step80108_2023-02-25-09-54-24_3shots_backup.json
deleted file mode 100644
index abc46a1af4298091c160131ae3bec321fcd80c2d..0000000000000000000000000000000000000000
--- a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_3_lm-eval_global_step80108_2023-02-25-09-54-24_3shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.316,
-      "acc_stderr": 0.014709193056057127
-    },
-    "anli_r2": {
-      "acc": 0.337,
-      "acc_stderr": 0.014955087918653609
-    },
-    "anli_r3": {
-      "acc": 0.355,
-      "acc_stderr": 0.0138192490040473
-    },
-    "cb": {
-      "acc": 0.44642857142857145,
-      "acc_stderr": 0.06703189227942398,
-      "f1": 0.25805555555555554
-    },
-    "copa": {
-      "acc": 0.79,
-      "acc_stderr": 0.040936018074033256
-    },
-    "hellaswag": {
-      "acc": 0.4627564230233021,
-      "acc_stderr": 0.004975919665116542,
-      "acc_norm": 0.6117307309300936,
-      "acc_norm_stderr": 0.004863603638367434
-    },
-    "rte": {
-      "acc": 0.5270758122743683,
-      "acc_stderr": 0.0300523034631437
-    },
-    "winogrande": {
-      "acc": 0.5824782951854776,
-      "acc_stderr": 0.013859978264440251
-    },
-    "storycloze_2016": {
-      "acc": 0.711918760021379,
-      "acc_stderr": 0.010472537019822576
-    },
-    "boolq": {
-      "acc": 0.5831804281345566,
-      "acc_stderr": 0.008623192108843677
-    },
-    "arc_easy": {
-      "acc": 0.5896464646464646,
-      "acc_stderr": 0.010093531255765457,
-      "acc_norm": 0.571969696969697,
-      "acc_norm_stderr": 0.01015294331642626
-    },
-    "arc_challenge": {
-      "acc": 0.27559726962457337,
-      "acc_stderr": 0.013057169655761841,
-      "acc_norm": 0.30204778156996587,
-      "acc_norm_stderr": 0.013417519144716413
-    },
-    "sciq": {
-      "acc": 0.834,
-      "acc_stderr": 0.011772110370812184,
-      "acc_norm": 0.793,
-      "acc_norm_stderr": 0.012818553557843986
-    },
-    "piqa": {
-      "acc": 0.7480957562568009,
-      "acc_stderr": 0.010128421335088683,
-      "acc_norm": 0.7595212187159956,
-      "acc_norm_stderr": 0.009971345364651066
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4.csv b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4.csv
new file mode 100644
index 0000000000000000000000000000000000000000..ed3c8fd785509c51700c73e461d06550ef78cdc1
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.323,0.014794927843348633,0
+anli_r2,acc,0.317,0.014721675438880236,0
+anli_r3,acc,0.3625,0.013883037874225516,0
+arc_challenge,acc,0.2790102389078498,0.013106784883601333,0
+arc_challenge,acc_norm,0.30802047781569963,0.013491429517292038,0
+arc_easy,acc,0.5942760942760943,0.010075755540128873,0
+arc_easy,acc_norm,0.5757575757575758,0.010141333654958552,0
+boolq,acc,0.5755351681957187,0.008644688121685498,1
+cb,acc,0.35714285714285715,0.06460957383809221,1
+cb,f1,0.19573820395738203,,1
+copa,acc,0.79,0.040936018074033256,0
+hellaswag,acc,0.4592710615415256,0.004973199296339971,0
+hellaswag,acc_norm,0.6106353316072496,0.00486609688094144,0
+piqa,acc,0.7540805223068553,0.010047331865625194,0
+piqa,acc_norm,0.7589771490750816,0.009979042717267314,0
+rte,acc,0.5126353790613718,0.030086851767188564,0
+sciq,acc,0.835,0.01174363286691616,0
+sciq,acc_norm,0.788,0.01293148186493805,0
+storycloze_2016,acc,0.7194013896312133,0.01038980964728882,0
+winogrande,acc,0.585635359116022,0.013844846232268565,0
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4_lm-eval_global_step80108_2023-02-25-09-56-03_4shots_backup.json b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4_lm-eval_global_step80108_2023-02-25-09-56-03_4shots_backup.json
deleted file mode 100644
index f2ae67f02b17af580c2c82aa51754a8399e7cbe8..0000000000000000000000000000000000000000
--- a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_4_lm-eval_global_step80108_2023-02-25-09-56-03_4shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.323,
-      "acc_stderr": 0.014794927843348633
-    },
-    "anli_r2": {
-      "acc": 0.317,
-      "acc_stderr": 0.014721675438880236
-    },
-    "anli_r3": {
-      "acc": 0.3625,
-      "acc_stderr": 0.013883037874225516
-    },
-    "cb": {
-      "acc": 0.35714285714285715,
-      "acc_stderr": 0.06460957383809221,
-      "f1": 0.19573820395738203
-    },
-    "copa": {
-      "acc": 0.79,
-      "acc_stderr": 0.040936018074033256
-    },
-    "hellaswag": {
-      "acc": 0.4592710615415256,
-      "acc_stderr": 0.004973199296339971,
-      "acc_norm": 0.6106353316072496,
-      "acc_norm_stderr": 0.00486609688094144
-    },
-    "rte": {
-      "acc": 0.5126353790613718,
-      "acc_stderr": 0.030086851767188564
-    },
-    "winogrande": {
-      "acc": 0.585635359116022,
-      "acc_stderr": 0.013844846232268565
-    },
-    "storycloze_2016": {
-      "acc": 0.7194013896312133,
-      "acc_stderr": 0.01038980964728882
-    },
-    "boolq": {
-      "acc": 0.5755351681957187,
-      "acc_stderr": 0.008644688121685498
-    },
-    "arc_easy": {
-      "acc": 0.5942760942760943,
-      "acc_stderr": 0.010075755540128873,
-      "acc_norm": 0.5757575757575758,
-      "acc_norm_stderr": 0.010141333654958552
-    },
-    "arc_challenge": {
-      "acc": 0.2790102389078498,
-      "acc_stderr": 0.013106784883601333,
-      "acc_norm": 0.30802047781569963,
-      "acc_norm_stderr": 0.013491429517292038
-    },
-    "sciq": {
-      "acc": 0.835,
-      "acc_stderr": 0.01174363286691616,
-      "acc_norm": 0.788,
-      "acc_norm_stderr": 0.01293148186493805
-    },
-    "piqa": {
-      "acc": 0.7540805223068553,
-      "acc_stderr": 0.010047331865625194,
-      "acc_norm": 0.7589771490750816,
-      "acc_norm_stderr": 0.009979042717267314
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5.csv b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5.csv
new file mode 100644
index 0000000000000000000000000000000000000000..eb3fe617ce15558facd348acaddcc1fd5d2f97fc
--- /dev/null
+++ b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.324,0.014806864733738857,0
+anli_r2,acc,0.338,0.014965960710224498,0
+anli_r3,acc,0.3525,0.013797164918918362,0
+arc_challenge,acc,0.2841296928327645,0.013179442447653887,0
+arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0
+arc_easy,acc,0.6043771043771043,0.010033741393430983,0
+arc_easy,acc_norm,0.5749158249158249,0.010143966195717845,0
+boolq,acc,0.5730886850152905,0.008651119069643816,1
+cb,acc,0.42857142857142855,0.06672848092813057,1
+cb,f1,0.25882352941176473,,1
+copa,acc,0.81,0.03942772444036623,0
+hellaswag,acc,0.45907189802828124,0.004973036453863711,0
+hellaswag,acc_norm,0.6099382593108943,0.004867670042866713,0
+piqa,acc,0.7480957562568009,0.010128421335088683,0
+piqa,acc_norm,0.7573449401523396,0.01000200256970869,0
+rte,acc,0.5234657039711191,0.030063300411902652,0
+sciq,acc,0.836,0.011715000693181331,0
+sciq,acc_norm,0.791,0.012864077288499337,0
+storycloze_2016,acc,0.7151256012827365,0.010437513986611718,0
+winogrande,acc,0.5777426992896606,0.013881582030658552,0
diff --git a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5_lm-eval_global_step80108_2023-02-25-09-56-03_5shots_backup.json b/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5_lm-eval_global_step80108_2023-02-25-09-56-03_5shots_backup.json
deleted file mode 100644
index ce35a8185f0c6cfc04c1d314afa35af83c1e48f8..0000000000000000000000000000000000000000
--- a/4b284b12bc4seed1/evaluation/rankeval/4b284b12bc4seed1_5_lm-eval_global_step80108_2023-02-25-09-56-03_5shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.324,
-      "acc_stderr": 0.014806864733738857
-    },
-    "anli_r2": {
-      "acc": 0.338,
-      "acc_stderr": 0.014965960710224498
-    },
-    "anli_r3": {
-      "acc": 0.3525,
-      "acc_stderr": 0.013797164918918362
-    },
-    "cb": {
-      "acc": 0.42857142857142855,
-      "acc_stderr": 0.06672848092813057,
-      "f1": 0.25882352941176473
-    },
-    "copa": {
-      "acc": 0.81,
-      "acc_stderr": 0.03942772444036623
-    },
-    "hellaswag": {
-      "acc": 0.45907189802828124,
-      "acc_stderr": 0.004973036453863711,
-      "acc_norm": 0.6099382593108943,
-      "acc_norm_stderr": 0.004867670042866713
-    },
-    "rte": {
-      "acc": 0.5234657039711191,
-      "acc_stderr": 0.030063300411902652
-    },
-    "winogrande": {
-      "acc": 0.5777426992896606,
-      "acc_stderr": 0.013881582030658552
-    },
-    "storycloze_2016": {
-      "acc": 0.7151256012827365,
-      "acc_stderr": 0.010437513986611718
-    },
-    "boolq": {
-      "acc": 0.5730886850152905,
-      "acc_stderr": 0.008651119069643816
-    },
-    "arc_easy": {
-      "acc": 0.6043771043771043,
-      "acc_stderr": 0.010033741393430983,
-      "acc_norm": 0.5749158249158249,
-      "acc_norm_stderr": 0.010143966195717845
-    },
-    "arc_challenge": {
-      "acc": 0.2841296928327645,
-      "acc_stderr": 0.013179442447653887,
-      "acc_norm": 0.3037542662116041,
-      "acc_norm_stderr": 0.01343890918477876
-    },
-    "sciq": {
-      "acc": 0.836,
-      "acc_stderr": 0.011715000693181331,
-      "acc_norm": 0.791,
-      "acc_norm_stderr": 0.012864077288499337
-    },
-    "piqa": {
-      "acc": 0.7480957562568009,
-      "acc_stderr": 0.010128421335088683,
-      "acc_norm": 0.7573449401523396,
-      "acc_norm_stderr": 0.01000200256970869
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json
a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d152f43c384fbe170ac320c3b4e1d5b35dc37ea5 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5083902070671548, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03989468844910082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07528694149270759, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001485415514756619}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3780318171066232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005223104661769293}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11728631797246779, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019247654134608228}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03516210005523054, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009572816724584412}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18705373069728767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037081482160342696}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05468830386197015, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012066786386101694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07002377028639689, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013498282472237842}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3530593907516912, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00490724418597446}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10921255839870517, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017509846379559321}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07051076285503056, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014029195285359902}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3526831574759125, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004764676209885756}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10965304015302584, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00179658805334725}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8e4f16013d82d5b10bebfcf36b4d6085de8d88cf --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.525451157972169, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.020807029076595943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07578151398935791, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0014658497562735056}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3776088572433392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005222009047076881}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11815166842945922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019539933108443297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035216953195913794, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008936559179157622}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18552571500246623, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003651941896665819}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05505652232011781, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012284406115594482}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07009792254358821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013082058038069526}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.35054744472531996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004791505363230279}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1094938648119312, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017598889432284988}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07078544810917187, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0013654984516325997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.35187230618740717, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004734030010907423}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11026920423657147, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018110860096435387}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..42a9fbd516d137f5d2b5cc5502f7519e2c33b05d --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.0073229441233485515, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007507591641053894}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.01188187354308511, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001193460962416848}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.007782363958910315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007494140926600601}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0014793812141488027, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000229663938081861}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002825244488953548, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005238543235168785}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0016405694963126324, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00023525203173163016}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.005727289612377416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005861714348780361}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009446302855839223, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009620615699727216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.006018804038099202, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005623113681535245}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.00679434254221087, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006951873055254002}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.011060401872827133, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011163157558010933}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.007193842982164766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006896239402426108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.090669570430738e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 8.489969077479997e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": 
false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..dbcabe9b4ed0c00430924f85b4cf95087db0309b --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.255556843183474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14306801493502502}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.40133894612714033, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022786254106147374}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4353400028716086, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025934401772713216}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.40223710706751803, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019113695305159313}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17167615815004073, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0017373085671162421}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18892368452661648, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002036271603133117}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17238412451550736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001647066730421544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2895776384246929, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0018377899797485326}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.31561872620267456, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002186944465790045}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2905126850575018, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016066395148831222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.33661571162841014, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0021532181771208945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36620748100065265, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002517396529311699}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3376761630795252, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001906552910299397}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c3156f6ce87d5de76b9fff3be4cb975a719ecb78 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 10.453538794646896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.1018770051872085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.40269265483237937, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022414598319616666}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4321840404996293, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002501669112723814}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.40172960710680156, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018233890583034939}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17489910198304812, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001749578738044224}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18884294973750174, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001942308180411026}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17413364387783356, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001607923940126345}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.29510908654175555, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00188243407356544}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.31785854835047744, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021450790852441397}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.29458902141806165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016001785328170475}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3399297127876474, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002185077308803786}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.364926897184195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00242487417897827}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3390409817775856, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018684469365198332}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c74da8966851d12613c86d92e0b8a33560ade868 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.024667231581309436, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016446798164940699}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.04578732040023698, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027949727542311478}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.029257582846789993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017353864694049216}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0025778362583607642, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0005426786000843293}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.005056626447538328, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008909529686166358}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0029439626544144923, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.00046267375464988764}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.021633378868917415, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014766881089830067}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.039954198597200195, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002427151773773013}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.025397669122061574, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001477824062133286}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.021302386042875585, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014719776290891184}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.039097440525313706, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023851152450321484}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.024937374055980524, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014662690411890744}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.27307893333920474, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07257669784005619}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json 
b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e47af6a5320c3947e89276578c2f702a3bfa1c --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/agg.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0016784285108223424, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00046933173483779724}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.001546401819187842, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00043408961008404196}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0015539306240835566, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00042422089610158747}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 5.04489960649783e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 5.044899606497852e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 5.360205831903945e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 5.360205831903959e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 5.19777535214928e-05, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 5.1977753521493134e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0015267311823769563, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00042072964022598367}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001446281044627963, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", 
"prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000414342893101565}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0014344496420688818, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003937675400802755}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0015235781201228954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004126564052488673}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0014595891418657935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00041568179279564054}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.001442691159744362, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003927017017637451}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 8.502508359632142e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.6224422585741e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e2184aca879ee312d3efd9c59106d0add7d94ff4 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523a10be06a013f2bb05811547226fd98dc369df3847b27724be8a1aafea3c94 +size 7826518 diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 
100644
index 0000000000000000000000000000000000000000..b65d5c73f6ebfd6e764a6f01d7454a4d0bb8ecfb
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24c9ffafef83f6d710e4c86cb60ee2c9777154d34e747dfdd445961ea9eb1316
+size 8717380
diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..e7e7a13dd3c33336d1131558565d93d929ece1e2
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ac756f88a66239f6b8606c2b7f66959a7836f8ea9af0ec79223498c6533292d
+size 34799887
diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..83002a4f6c719b5fa7c9962e294920f99dc85478
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7fe94a928fa0803373afccab617f52f5aec27b430fa0af243ba7682a6989937
+size 8394602
diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..530713667ef8e4062c1f3126476921b112c178e8
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b722acf4e2608d29748cb8ee7920902003c939595743a4cf0cdf83c583f45b3b
+size 9478188
diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..1196cbc32963b4522d90f7281039ca313ccd5347
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f09666f303c2b90cbeeba1c009c32e5ca8e2e465948115d1c01bd446de824cc7
+size 11672650
diff --git a/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6786060d7102b15384259ad65b252581f4a62be5
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/examples.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac678d77590701961941c02a30d1b52c4bb0eb404159f52e5caf1640711028a1
+size 13897546
diff --git a/4b284b12bc4seed2/evaluation/generation/merged.csv b/4b284b12bc4seed2/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..b5cc7e53a9f1b4eb9a84055c1c468509b85bf22e
--- /dev/null
+++ b/4b284b12bc4seed2/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0043911029729981465
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0043911029729981465
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.14634556896551848
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.14634556896551848
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.16691425263662168
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16691425263662168
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.17476964694401387
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.17476964694401387
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.17238412451550736
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.17238412451550736
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.17413364387783356
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17413364387783356
+e2e_nlg_cleaned,5,average,multiple,0.1398230566520822
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.01153407231838246
+gem_xsum,0,median,rouge2_fmeasure,0.01153407231838246
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.01870363630762986
+gem_xsum,1,median,rouge2_fmeasure,0.01870363630762986
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.02149971321984745
+gem_xsum,2,median,rouge2_fmeasure,0.02149971321984745
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.012510492483954895
+gem_xsum,3,median,rouge2_fmeasure,0.012510492483954895
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0029439626544144923
+gem_xsum,4,median,rouge2_fmeasure,0.0029439626544144923
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,5.19777535214928e-05
+gem_xsum,5,median,rouge2_fmeasure,5.19777535214928e-05
+gem_xsum,5,average,multiple,0.01120730912295844
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05341982709541381
+web_nlg_en,0,median,rouge2_fmeasure,0.05341982709541381
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05569920972530597
+web_nlg_en,1,median,rouge2_fmeasure,0.05569920972530597
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.055829123679104704
+web_nlg_en,2,median,rouge2_fmeasure,0.055829123679104704
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0555802940665526
+web_nlg_en,3,median,rouge2_fmeasure,0.0555802940665526
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05468830386197015
+web_nlg_en,4,median,rouge2_fmeasure,0.05468830386197015
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05505652232011781
+web_nlg_en,5,median,rouge2_fmeasure,0.05505652232011781
+web_nlg_en,5,average,multiple,0.05504554679141084
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.005442298869452033
+wiki_lingua_en,0,median,rouge2_fmeasure,0.005442298869452033
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.033304794776064954
+wiki_lingua_en,1,median,rouge2_fmeasure,0.033304794776064954
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.03566526748783912
+wiki_lingua_en,2,median,rouge2_fmeasure,0.03566526748783912
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.02943630075177706
+wiki_lingua_en,3,median,rouge2_fmeasure,0.02943630075177706
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.009975083716813172
+wiki_lingua_en,4,median,rouge2_fmeasure,0.009975083716813172
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0016405694963126324
+wiki_lingua_en,5,median,rouge2_fmeasure,0.0016405694963126324
+wiki_lingua_en,5,average,multiple,0.019244052516376495 diff --git a/4b284b12bc4seed2/evaluation/generation/merged.json b/4b284b12bc4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..cda2ee526cd5f98b540a71ae568c63ae4a8c1513 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3666975928567752, "bleu_stderr": 0.033942619502247987, "rouge1_fmeasure": 0.11410488927905772, "rouge1_fmeasure_stderr": 0.0021273756435243453, "rouge1_precision": 0.07654378919459422, "rouge1_precision_stderr": 0.0018618261262039134, "rouge1_recall": 0.32609614121370334, "rouge1_recall_stderr": 0.004974989840599945, "rouge2_fmeasure": 0.05341982709541381, "rouge2_fmeasure_stderr": 0.001325737944102095, "rouge2_precision": 0.03603776896987243, "rouge2_precision_stderr": 0.0012493286067619042, "rouge2_recall": 0.15877363093279048, "rouge2_recall_stderr": 0.003408901924533892, "rougeL_fmeasure": 0.10840968841172977, "rougeL_fmeasure_stderr": 0.0019678561218047716, "rougeL_precision": 0.07257143485800877, "rougeL_precision_stderr": 0.0017468394574395602, "rougeL_recall": 0.31289502631185434, "rougeL_recall_stderr": 0.004801473267924515, "rougeLsum_fmeasure": 0.10730895702341088, "rougeLsum_fmeasure_stderr": 0.0019852721665848463, "rougeLsum_precision": 0.0721278567630955, "rougeLsum_precision_stderr": 0.0017765340801950376, "rougeLsum_recall": 0.30700384670165076, "rougeLsum_recall_stderr": 0.004639025717519335}}, "1": {"PALM_prompt": {"bleu": 0.4204000069724605, "bleu_stderr": 0.03454709980239004, "rouge1_fmeasure": 0.11863230545480506, "rouge1_fmeasure_stderr": 0.0020998490379809595, "rouge1_precision": 0.07729381671566446, "rouge1_precision_stderr": 0.0016249952401225709, "rouge1_recall": 0.3463514185710177, "rouge1_recall_stderr": 0.004906324748729579, "rouge2_fmeasure": 0.05569920972530597, "rouge2_fmeasure_stderr": 0.0013333357482266375, "rouge2_precision": 0.036344067746574706, "rouge2_precision_stderr": 0.0010138056526646537, "rouge2_recall": 0.1684509670618413, "rouge2_recall_stderr": 0.0034374849683514908, "rougeL_fmeasure": 0.11211899409488457, "rougeL_fmeasure_stderr": 0.0019248317217534817, "rougeL_precision": 0.0727861600331886, "rougeL_precision_stderr": 0.0014689871591747046, "rougeL_recall": 0.32987484171743015, "rougeL_recall_stderr": 0.004706952933270804, "rougeLsum_fmeasure": 0.11160505052176106, "rougeLsum_fmeasure_stderr": 0.001958165298495008, "rougeLsum_precision": 0.07272118809874989, "rougeLsum_precision_stderr": 0.0015190345258311126, "rougeLsum_recall": 0.3259034212737053, "rougeLsum_recall_stderr": 0.004542325342065352}}, "2": {"PALM_prompt": {"bleu": 0.4748201943710615, "bleu_stderr": 0.030699841465897198, "rouge1_fmeasure": 0.11834242699459707, "rouge1_fmeasure_stderr": 0.0020730822161238323, "rouge1_precision": 0.07667243246356968, "rouge1_precision_stderr": 0.0015739220370015814, "rouge1_recall": 0.35665862094926554, "rouge1_recall_stderr": 0.005095225993954892, "rouge2_fmeasure": 0.055829123679104704, "rouge2_fmeasure_stderr": 0.0013065760171567398, "rouge2_precision": 0.03605855238775504, "rouge2_precision_stderr": 0.0009560579302419175, "rouge2_recall": 0.17612198539506513, "rouge2_recall_stderr": 0.0035618179908472056, "rougeL_fmeasure": 0.11117048364501488, "rougeL_fmeasure_stderr": 0.0018719625581382908, "rougeL_precision": 0.07180378962637364, "rougeL_precision_stderr": 0.0014070124296651453, "rougeL_recall": 0.33774827297948934, 
"rougeL_recall_stderr": 0.004811147115289192, "rougeLsum_fmeasure": 0.11132148782522473, "rougeLsum_fmeasure_stderr": 0.001919293265591665, "rougeLsum_precision": 0.07214285401823774, "rougeLsum_precision_stderr": 0.0014626628042382018, "rougeLsum_recall": 0.33633909533117384, "rougeLsum_recall_stderr": 0.00472655160720387}}, "3": {"PALM_prompt": {"bleu": 0.5549733758882326, "bleu_stderr": 0.03334480985874615, "rouge1_fmeasure": 0.11965471898211823, "rouge1_fmeasure_stderr": 0.0020402171706270338, "rouge1_precision": 0.07698392131996552, "rouge1_precision_stderr": 0.0015413715512529594, "rouge1_recall": 0.37726780498158974, "rouge1_recall_stderr": 0.005202842583236199, "rouge2_fmeasure": 0.0555802940665526, "rouge2_fmeasure_stderr": 0.0012958506193786643, "rouge2_precision": 0.0357119459728351, "rouge2_precision_stderr": 0.0009447013213754063, "rouge2_recall": 0.18257224954432466, "rouge2_recall_stderr": 0.0036301676144376914, "rougeL_fmeasure": 0.11086203282772523, "rougeL_fmeasure_stderr": 0.0018308098587233419, "rougeL_precision": 0.07108997931335904, "rougeL_precision_stderr": 0.0013561691764805398, "rougeL_recall": 0.3515365565823446, "rougeL_recall_stderr": 0.004828539628946869, "rougeLsum_fmeasure": 0.11168460054725654, "rougeLsum_fmeasure_stderr": 0.0018725810295441223, "rougeLsum_precision": 0.07183721682893512, "rougeLsum_precision_stderr": 0.0014096707081297398, "rougeLsum_recall": 0.3524043706111252, "rougeLsum_recall_stderr": 0.004734089066449836}}, "4": {"PALM_prompt": {"bleu": 0.5083902070671548, "bleu_stderr": 0.03989468844910082, "rouge1_fmeasure": 0.11728631797246779, "rouge1_fmeasure_stderr": 0.0019247654134608228, "rouge1_precision": 0.07528694149270759, "rouge1_precision_stderr": 0.001485415514756619, "rouge1_recall": 0.3780318171066232, "rouge1_recall_stderr": 0.005223104661769293, "rouge2_fmeasure": 0.05468830386197015, "rouge2_fmeasure_stderr": 0.0012066786386101694, "rouge2_precision": 0.03516210005523054, "rouge2_precision_stderr": 0.0009572816724584412, "rouge2_recall": 0.18705373069728767, "rouge2_recall_stderr": 0.0037081482160342696, "rougeL_fmeasure": 0.10921255839870517, "rougeL_fmeasure_stderr": 0.0017509846379559321, "rougeL_precision": 0.07002377028639689, "rougeL_precision_stderr": 0.0013498282472237842, "rougeL_recall": 0.3530593907516912, "rougeL_recall_stderr": 0.00490724418597446, "rougeLsum_fmeasure": 0.10965304015302584, "rougeLsum_fmeasure_stderr": 0.00179658805334725, "rougeLsum_precision": 0.07051076285503056, "rougeLsum_precision_stderr": 0.0014029195285359902, "rougeLsum_recall": 0.3526831574759125, "rougeLsum_recall_stderr": 0.004764676209885756}}, "5": {"PALM_prompt": {"bleu": 0.525451157972169, "bleu_stderr": 0.020807029076595943, "rouge1_fmeasure": 0.11815166842945922, "rouge1_fmeasure_stderr": 0.0019539933108443297, "rouge1_precision": 0.07578151398935791, "rouge1_precision_stderr": 0.0014658497562735056, "rouge1_recall": 0.3776088572433392, "rouge1_recall_stderr": 0.005222009047076881, "rouge2_fmeasure": 0.05505652232011781, "rouge2_fmeasure_stderr": 0.0012284406115594482, "rouge2_precision": 0.035216953195913794, "rouge2_precision_stderr": 0.0008936559179157622, "rouge2_recall": 0.18552571500246623, "rouge2_recall_stderr": 0.003651941896665819, "rougeL_fmeasure": 0.1094938648119312, "rougeL_fmeasure_stderr": 0.0017598889432284988, "rougeL_precision": 0.07009792254358821, "rougeL_precision_stderr": 0.0013082058038069526, "rougeL_recall": 0.35054744472531996, "rougeL_recall_stderr": 0.004791505363230279, "rougeLsum_fmeasure": 
0.11026920423657147, "rougeLsum_fmeasure_stderr": 0.0018110860096435387, "rougeLsum_precision": 0.07078544810917187, "rougeLsum_precision_stderr": 0.0013654984516325997, "rougeLsum_recall": 0.35187230618740717, "rougeLsum_recall_stderr": 0.004734030010907423}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.24392437655372481, "bleu_stderr": 0.023857847767670762, "rouge1_fmeasure": 0.09276892306618244, "rouge1_fmeasure_stderr": 0.0011960344018118775, "rouge1_precision": 0.08041542381459962, "rouge1_precision_stderr": 0.0012034347430153701, "rouge1_recall": 0.12977261642720425, "rouge1_recall_stderr": 0.0016061415032672362, "rouge2_fmeasure": 0.005442298869452033, "rouge2_fmeasure_stderr": 0.00031336134550929984, "rouge2_precision": 0.00482101020032747, "rouge2_precision_stderr": 0.000288358623019921, "rouge2_recall": 0.007392323231333513, "rouge2_recall_stderr": 0.0004384293210501124, "rougeL_fmeasure": 0.08081272257942029, "rougeL_fmeasure_stderr": 0.000958641259754323, "rougeL_precision": 0.06948188762406071, "rougeL_precision_stderr": 0.0009562354726599046, "rougeL_recall": 0.1148205615117217, "rougeL_recall_stderr": 0.0013763320189433634, "rougeLsum_fmeasure": 0.08675704545208646, "rougeLsum_fmeasure_stderr": 0.0011000237175663734, "rougeLsum_precision": 0.07505126481568339, "rougeLsum_precision_stderr": 0.0011046552942777078, "rougeLsum_recall": 0.12191849988137807, "rougeLsum_recall_stderr": 0.001502332271175678}}, "1": {"tldr_en": {"bleu": 1.6125876717654175, "bleu_stderr": 0.030611128690029174, "rouge1_fmeasure": 0.17098154370812704, "rouge1_fmeasure_stderr": 0.0019238619176074028, "rouge1_precision": 0.1455337083731935, "rouge1_precision_stderr": 0.0019032086865170478, "rouge1_recall": 0.25012143517213287, "rouge1_recall_stderr": 0.0028277377897345103, "rouge2_fmeasure": 0.033304794776064954, "rouge2_fmeasure_stderr": 0.0008869000313608735, "rouge2_precision": 0.028195165306937067, "rouge2_precision_stderr": 0.0007889557545714422, "rouge2_recall": 0.0503519559667884, "rouge2_recall_stderr": 0.001442734800038838, "rougeL_fmeasure": 0.12805479000603123, "rougeL_fmeasure_stderr": 0.0012729011455461764, "rougeL_precision": 0.10772839012506621, "rougeL_precision_stderr": 0.0012393912404238278, "rougeL_recall": 0.19185007869835716, "rougeL_recall_stderr": 0.002113757458604234, "rougeLsum_fmeasure": 0.15864492369929892, "rougeLsum_fmeasure_stderr": 0.001762860943938729, "rougeLsum_precision": 0.13484649948542737, "rougeLsum_precision_stderr": 0.001744825336362372, "rougeLsum_recall": 0.2327450602622433, "rougeLsum_recall_stderr": 0.002613147650213692}}, "2": {"tldr_en": {"bleu": 1.7772115660325392, "bleu_stderr": 0.047936589027589356, "rouge1_fmeasure": 0.17893721887231873, "rouge1_fmeasure_stderr": 0.0019115838216723414, "rouge1_precision": 0.1521730564438585, "rouge1_precision_stderr": 0.0019244965733381224, "rouge1_recall": 0.2619409970589743, "rouge1_recall_stderr": 0.002792920076539665, "rouge2_fmeasure": 0.03566526748783912, "rouge2_fmeasure_stderr": 0.0008976747682681302, "rouge2_precision": 0.030121700253615085, "rouge2_precision_stderr": 0.0007987546017186038, "rouge2_recall": 0.05440936550544913, "rouge2_recall_stderr": 0.0015261127047882838, "rougeL_fmeasure": 0.1348199640996336, "rougeL_fmeasure_stderr": 0.0012709671031115265, "rougeL_precision": 0.11333400415154432, "rougeL_precision_stderr": 0.001252662085867768, "rougeL_recall": 0.20213029522545098, "rougeL_recall_stderr": 0.0021327420832715108, "rougeLsum_fmeasure": 0.16603969831441404, "rougeLsum_fmeasure_stderr": 
0.0017701912652569377, "rougeLsum_precision": 0.14105987920215718, "rougeLsum_precision_stderr": 0.0017803037903608345, "rougeLsum_recall": 0.24387535828634377, "rougeLsum_recall_stderr": 0.002626333657910018}}, "3": {"tldr_en": {"bleu": 1.7384602326131295, "bleu_stderr": 0.07786433868102548, "rouge1_fmeasure": 0.1475192981997226, "rouge1_fmeasure_stderr": 0.0020874234612903364, "rouge1_precision": 0.13124657860364822, "rouge1_precision_stderr": 0.0022184691237030767, "rouge1_recall": 0.21580668554586308, "rouge1_recall_stderr": 0.0031217848837206867, "rouge2_fmeasure": 0.02943630075177706, "rouge2_fmeasure_stderr": 0.0008619263938236904, "rouge2_precision": 0.02556631610400526, "rouge2_precision_stderr": 0.0008289864287315672, "rouge2_recall": 0.04489589142005177, "rouge2_recall_stderr": 0.0014219423335759458, "rougeL_fmeasure": 0.11170409200150849, "rougeL_fmeasure_stderr": 0.001457057228328002, "rougeL_precision": 0.09922448432903473, "rougeL_precision_stderr": 0.0016571195136742573, "rougeL_recall": 0.1671818366386249, "rougeL_recall_stderr": 0.002414244291432632, "rougeLsum_fmeasure": 0.1370205638904663, "rougeLsum_fmeasure_stderr": 0.0019368277108348473, "rougeLsum_precision": 0.12208780038033144, "rougeLsum_precision_stderr": 0.002084036711937265, "rougeLsum_recall": 0.20069686438127185, "rougeLsum_recall_stderr": 0.0029103319320837972}}, "4": {"tldr_en": {"bleu": 0.3960094829116993, "bleu_stderr": 0.03202328755305624, "rouge1_fmeasure": 0.04982811807836869, "rouge1_fmeasure_stderr": 0.0017359012937995398, "rouge1_precision": 0.04553666129964189, "rouge1_precision_stderr": 0.0017147390368758152, "rouge1_recall": 0.07503959025491676, "rouge1_recall_stderr": 0.0026592166476963725, "rouge2_fmeasure": 0.009975083716813172, "rouge2_fmeasure_stderr": 0.0005509539367896302, "rouge2_precision": 0.008610828899341378, "rouge2_precision_stderr": 0.0005000696489982937, "rouge2_recall": 0.01599372920183621, "rouge2_recall_stderr": 0.0009747815347100425, "rougeL_fmeasure": 0.0384196111991066, "rougeL_fmeasure_stderr": 0.0012953226612167558, "rougeL_precision": 0.03503994473270826, "rougeL_precision_stderr": 0.0012923406095347262, "rougeL_recall": 0.059122890511189616, "rougeL_recall_stderr": 0.0020945165554899304, "rougeLsum_fmeasure": 0.04642174084177454, "rougeLsum_fmeasure_stderr": 0.0016156979564290537, "rougeLsum_precision": 0.04241127409619883, "rougeLsum_precision_stderr": 0.0015945350214724905, "rougeLsum_recall": 0.07008648862134945, "rougeLsum_recall_stderr": 0.002485730743546559}}, "5": {"tldr_en": {"bleu": 5.090669570430738e-07, "bleu_stderr": 8.489969077479997e-07, "rouge1_fmeasure": 0.007782363958910315, "rouge1_fmeasure_stderr": 0.0007494140926600601, "rouge1_precision": 0.0073229441233485515, "rouge1_precision_stderr": 0.0007507591641053894, "rouge1_recall": 0.01188187354308511, "rouge1_recall_stderr": 0.001193460962416848, "rouge2_fmeasure": 0.0016405694963126324, "rouge2_fmeasure_stderr": 0.00023525203173163016, "rouge2_precision": 0.0014793812141488027, "rouge2_precision_stderr": 0.000229663938081861, "rouge2_recall": 0.002825244488953548, "rouge2_recall_stderr": 0.0005238543235168785, "rougeL_fmeasure": 0.006018804038099202, "rougeL_fmeasure_stderr": 0.0005623113681535245, "rougeL_precision": 0.005727289612377416, "rougeL_precision_stderr": 0.0005861714348780361, "rougeL_recall": 0.009446302855839223, "rougeL_recall_stderr": 0.0009620615699727216, "rougeLsum_fmeasure": 0.007193842982164766, "rougeLsum_fmeasure_stderr": 0.0006896239402426108, "rougeLsum_precision": 
0.00679434254221087, "rougeLsum_precision_stderr": 0.0006951873055254002, "rougeLsum_recall": 0.011060401872827133, "rougeLsum_recall_stderr": 0.0011163157558010933}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.04307253383096468, "bleu_stderr": 0.00615780467434044, "rouge1_fmeasure": 0.046890391429100335, "rouge1_fmeasure_stderr": 0.0007011195091317708, "rouge1_precision": 0.05053375977322034, "rouge1_precision_stderr": 0.0010158112551469237, "rouge1_recall": 0.0652936855655388, "rouge1_recall_stderr": 0.0011167830442822143, "rouge2_fmeasure": 0.0043911029729981465, "rouge2_fmeasure_stderr": 0.00020467393315053172, "rouge2_precision": 0.0035689796753305652, "rouge2_precision_stderr": 0.00021117638207178657, "rouge2_recall": 0.00680143761288298, "rouge2_recall_stderr": 0.0003224069999120447, "rougeL_fmeasure": 0.04657371060940762, "rougeL_fmeasure_stderr": 0.0006904790861998436, "rougeL_precision": 0.05006831349983385, "rougeL_precision_stderr": 0.0009936655556977518, "rougeL_recall": 0.06495084101211217, "rougeL_recall_stderr": 0.0011078411551186476, "rougeLsum_fmeasure": 0.0414614170947359, "rougeLsum_fmeasure_stderr": 0.0006139268554968763, "rougeLsum_precision": 0.046457395636209424, "rougeLsum_precision_stderr": 0.0010026919226726794, "rougeLsum_recall": 0.056636945108678484, "rougeLsum_recall_stderr": 0.0009405550431936621}}, "1": {"generate_text_restaurant": {"bleu": 7.5581936543754376, "bleu_stderr": 0.11126839308169957, "rouge1_fmeasure": 0.3720334093393895, "rouge1_fmeasure_stderr": 0.0021887575462331737, "rouge1_precision": 0.36468914807777286, "rouge1_precision_stderr": 0.0026191701242675273, "rouge1_recall": 0.42454236506789683, "rouge1_recall_stderr": 0.002854215718266759, "rouge2_fmeasure": 0.14634556896551848, "rouge2_fmeasure_stderr": 0.001551058335752227, "rouge2_precision": 0.14368390527185781, "rouge2_precision_stderr": 0.0017338258439567463, "rouge2_recall": 0.16851243140438502, "rouge2_recall_stderr": 0.001912073180376547, "rougeL_fmeasure": 0.2545609931556668, "rougeL_fmeasure_stderr": 0.0016225757478674334, "rougeL_precision": 0.2501530047146125, "rougeL_precision_stderr": 0.0019969846559516353, "rougeL_recall": 0.29284296638863844, "rougeL_recall_stderr": 0.0022356510211168855, "rougeLsum_fmeasure": 0.30559945190747523, "rougeLsum_fmeasure_stderr": 0.002005656004936061, "rougeLsum_precision": 0.30072566914524307, "rougeLsum_precision_stderr": 0.0023876131484858974, "rougeLsum_recall": 0.34812960779874275, "rougeLsum_recall_stderr": 0.0025571739561978655}}, "2": {"generate_text_restaurant": {"bleu": 9.401928562229555, "bleu_stderr": 0.1567763787830054, "rouge1_fmeasure": 0.40026173535365556, "rouge1_fmeasure_stderr": 0.0019715999520502525, "rouge1_precision": 0.40080790096672786, "rouge1_precision_stderr": 0.0023431638323800513, "rouge1_recall": 0.43692976078064044, "rouge1_recall_stderr": 0.0027384926504820336, "rouge2_fmeasure": 0.16691425263662168, "rouge2_fmeasure_stderr": 0.001584309925310028, "rouge2_precision": 0.16690793931954076, "rouge2_precision_stderr": 0.0017426038897506064, "rouge2_recall": 0.1845979813386405, "rouge2_recall_stderr": 0.0019887719670502926, "rougeL_fmeasure": 0.2778592068167227, "rougeL_fmeasure_stderr": 0.0016171214159128506, "rougeL_precision": 0.2785202312030322, "rougeL_precision_stderr": 0.0019011043706789524, "rougeL_recall": 0.3046480183356087, "rougeL_recall_stderr": 0.0022397319337808465, "rougeLsum_fmeasure": 0.3300400794380935, "rougeLsum_fmeasure_stderr": 0.0019018703890833699, "rougeLsum_precision": 
0.3309782590686788, "rougeLsum_precision_stderr": 0.002210876758312972, "rougeLsum_recall": 0.3603290477325708, "rougeLsum_recall_stderr": 0.0025365179929270143}}, "3": {"generate_text_restaurant": {"bleu": 10.213394889083318, "bleu_stderr": 0.13997266948295084, "rouge1_fmeasure": 0.40620019916876426, "rouge1_fmeasure_stderr": 0.001889361609626469, "rouge1_precision": 0.40417575321020377, "rouge1_precision_stderr": 0.00227014310665305, "rouge1_recall": 0.44263977445184194, "rouge1_recall_stderr": 0.002603197211967317, "rouge2_fmeasure": 0.17476964694401387, "rouge2_fmeasure_stderr": 0.0016365140191570804, "rouge2_precision": 0.17336100314085984, "rouge2_precision_stderr": 0.001719523491610732, "rouge2_recall": 0.1925950069188434, "rouge2_recall_stderr": 0.002008348010949252, "rougeL_fmeasure": 0.2890898692590789, "rougeL_fmeasure_stderr": 0.0016161064052956943, "rougeL_precision": 0.2876013029701323, "rougeL_precision_stderr": 0.0018718292268598646, "rougeL_recall": 0.31615779730252647, "rougeL_recall_stderr": 0.0021802769594592, "rougeLsum_fmeasure": 0.3406161582563531, "rougeLsum_fmeasure_stderr": 0.001893065321419412, "rougeLsum_precision": 0.3390891522957183, "rougeLsum_precision_stderr": 0.0021744867159723394, "rougeLsum_recall": 0.37119788702454404, "rougeLsum_recall_stderr": 0.0024763950418716834}}, "4": {"generate_text_restaurant": {"bleu": 10.255556843183474, "bleu_stderr": 0.14306801493502502, "rouge1_fmeasure": 0.40223710706751803, "rouge1_fmeasure_stderr": 0.0019113695305159313, "rouge1_precision": 0.40133894612714033, "rouge1_precision_stderr": 0.0022786254106147374, "rouge1_recall": 0.4353400028716086, "rouge1_recall_stderr": 0.0025934401772713216, "rouge2_fmeasure": 0.17238412451550736, "rouge2_fmeasure_stderr": 0.001647066730421544, "rouge2_precision": 0.17167615815004073, "rouge2_precision_stderr": 0.0017373085671162421, "rouge2_recall": 0.18892368452661648, "rouge2_recall_stderr": 0.002036271603133117, "rougeL_fmeasure": 0.2905126850575018, "rougeL_fmeasure_stderr": 0.0016066395148831222, "rougeL_precision": 0.2895776384246929, "rougeL_precision_stderr": 0.0018377899797485326, "rougeL_recall": 0.31561872620267456, "rougeL_recall_stderr": 0.002186944465790045, "rougeLsum_fmeasure": 0.3376761630795252, "rougeLsum_fmeasure_stderr": 0.001906552910299397, "rougeLsum_precision": 0.33661571162841014, "rougeLsum_precision_stderr": 0.0021532181771208945, "rougeLsum_recall": 0.36620748100065265, "rougeLsum_recall_stderr": 0.002517396529311699}}, "5": {"generate_text_restaurant": {"bleu": 10.453538794646896, "bleu_stderr": 0.1018770051872085, "rouge1_fmeasure": 0.40172960710680156, "rouge1_fmeasure_stderr": 0.0018233890583034939, "rouge1_precision": 0.40269265483237937, "rouge1_precision_stderr": 0.0022414598319616666, "rouge1_recall": 0.4321840404996293, "rouge1_recall_stderr": 0.002501669112723814, "rouge2_fmeasure": 0.17413364387783356, "rouge2_fmeasure_stderr": 0.001607923940126345, "rouge2_precision": 0.17489910198304812, "rouge2_precision_stderr": 0.001749578738044224, "rouge2_recall": 0.18884294973750174, "rouge2_recall_stderr": 0.001942308180411026, "rougeL_fmeasure": 0.29458902141806165, "rougeL_fmeasure_stderr": 0.0016001785328170475, "rougeL_precision": 0.29510908654175555, "rougeL_precision_stderr": 0.00188243407356544, "rougeL_recall": 0.31785854835047744, "rougeL_recall_stderr": 0.0021450790852441397, "rougeLsum_fmeasure": 0.3390409817775856, "rougeLsum_fmeasure_stderr": 0.0018684469365198332, "rougeLsum_precision": 0.3399297127876474, "rougeLsum_precision_stderr": 
0.002185077308803786, "rougeLsum_recall": 0.364926897184195, "rougeLsum_recall_stderr": 0.00242487417897827}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.4630139659504365, "bleu_stderr": 0.07092608567266476, "rouge1_fmeasure": 0.11690451064634964, "rouge1_fmeasure_stderr": 0.001967787449479756, "rouge1_precision": 0.08488770249277669, "rouge1_precision_stderr": 0.001509209602333853, "rouge1_recall": 0.19813172755834577, "rouge1_recall_stderr": 0.0032474572521749194, "rouge2_fmeasure": 0.01153407231838246, "rouge2_fmeasure_stderr": 0.0008078640211965759, "rouge2_precision": 0.008246438330541757, "rouge2_precision_stderr": 0.0005824935746807674, "rouge2_recall": 0.020228282417241533, "rouge2_recall_stderr": 0.0014323430166151545, "rougeL_fmeasure": 0.09705401381411084, "rougeL_fmeasure_stderr": 0.0014626510822599958, "rougeL_precision": 0.07026327535010932, "rougeL_precision_stderr": 0.0011182356154930892, "rougeL_recall": 0.16554056958926258, "rougeL_recall_stderr": 0.002477178559945266, "rougeLsum_fmeasure": 0.09684924566454005, "rougeLsum_fmeasure_stderr": 0.001564778913537185, "rougeLsum_precision": 0.0700549633031646, "rougeLsum_precision_stderr": 0.0011794948365267052, "rougeLsum_recall": 0.1654358655028746, "rougeLsum_recall_stderr": 0.002682749737328667}}, "1": {"article_DOC_summary": {"bleu": 0.7871536101776422, "bleu_stderr": 0.07322906159695927, "rouge1_fmeasure": 0.1288496551436097, "rouge1_fmeasure_stderr": 0.002308468910100682, "rouge1_precision": 0.09157070008924, "rouge1_precision_stderr": 0.0016830670326091262, "rouge1_recall": 0.22600669268446158, "rouge1_recall_stderr": 0.004011298506451288, "rouge2_fmeasure": 0.01870363630762986, "rouge2_fmeasure_stderr": 0.001060863444959382, "rouge2_precision": 0.013095108182729265, "rouge2_precision_stderr": 0.0007453038554987561, "rouge2_recall": 0.03410537737570771, "rouge2_recall_stderr": 0.001974452124311393, "rougeL_fmeasure": 0.10718165977213731, "rougeL_fmeasure_stderr": 0.0017615520710667569, "rougeL_precision": 0.07608507851006019, "rougeL_precision_stderr": 0.0012823580914152802, "rougeL_recall": 0.1887183804153633, "rougeL_recall_stderr": 0.0031215258317537436, "rougeLsum_fmeasure": 0.10668730515839585, "rougeLsum_fmeasure_stderr": 0.0018825214777256206, "rougeLsum_precision": 0.07570007632515688, "rougeLsum_precision_stderr": 0.0013649234652551023, "rougeLsum_recall": 0.18794434033398466, "rougeLsum_recall_stderr": 0.0033324412278060815}}, "2": {"article_DOC_summary": {"bleu": 0.8336246828929866, "bleu_stderr": 0.054821058679941914, "rouge1_fmeasure": 0.13584092927811475, "rouge1_fmeasure_stderr": 0.0023894507967195068, "rouge1_precision": 0.09606365410995298, "rouge1_precision_stderr": 0.0017289752531892755, "rouge1_recall": 0.24109076789716347, "rouge1_recall_stderr": 0.004249367408997067, "rouge2_fmeasure": 0.02149971321984745, "rouge2_fmeasure_stderr": 0.0011072351066159815, "rouge2_precision": 0.014965724288431435, "rouge2_precision_stderr": 0.0007708705714099285, "rouge2_recall": 0.03961712621671652, "rouge2_recall_stderr": 0.002080367490270864, "rougeL_fmeasure": 0.11064742747597277, "rougeL_fmeasure_stderr": 0.001786202252874528, "rougeL_precision": 0.07816557157605167, "rougeL_precision_stderr": 0.0012876620444083977, "rougeL_recall": 0.1971051484974429, "rougeL_recall_stderr": 0.003253803320603986, "rougeLsum_fmeasure": 0.11284377229419564, "rougeLsum_fmeasure_stderr": 0.0019429709645863045, "rougeLsum_precision": 0.07965005221915179, "rougeLsum_precision_stderr": 0.0013920378474243525, 
"rougeLsum_recall": 0.20132764801033967, "rougeLsum_recall_stderr": 0.0035528940991881495}}, "3": {"article_DOC_summary": {"bleu": 0.5212759581952935, "bleu_stderr": 0.05480106482512421, "rouge1_fmeasure": 0.10839918090666271, "rouge1_fmeasure_stderr": 0.002134922951671863, "rouge1_precision": 0.07905639335534033, "rouge1_precision_stderr": 0.0016483058479411423, "rouge1_recall": 0.18619242699929947, "rouge1_recall_stderr": 0.003677925190637863, "rouge2_fmeasure": 0.012510492483954895, "rouge2_fmeasure_stderr": 0.0008578414376504718, "rouge2_precision": 0.00897105955837582, "rouge2_precision_stderr": 0.0006141464794685609, "rouge2_recall": 0.0220462775032213, "rouge2_recall_stderr": 0.00156983945630155, "rougeL_fmeasure": 0.09293093435452444, "rougeL_fmeasure_stderr": 0.0017059385683015748, "rougeL_precision": 0.06776336855389609, "rougeL_precision_stderr": 0.0013482428780685152, "rougeL_recall": 0.1599342621350528, "rougeL_recall_stderr": 0.002950049745796865, "rougeLsum_fmeasure": 0.09195844966907195, "rougeLsum_fmeasure_stderr": 0.001763679018061731, "rougeLsum_precision": 0.06698510588871204, "rougeLsum_precision_stderr": 0.0013678776526961345, "rougeLsum_recall": 0.15844453034703956, "rougeLsum_recall_stderr": 0.003088521487230944}}, "4": {"article_DOC_summary": {"bleu": 0.27307893333920474, "bleu_stderr": 0.07257669784005619, "rouge1_fmeasure": 0.029257582846789993, "rouge1_fmeasure_stderr": 0.0017353864694049216, "rouge1_precision": 0.024667231581309436, "rouge1_precision_stderr": 0.0016446798164940699, "rouge1_recall": 0.04578732040023698, "rouge1_recall_stderr": 0.0027949727542311478, "rouge2_fmeasure": 0.0029439626544144923, "rouge2_fmeasure_stderr": 0.00046267375464988764, "rouge2_precision": 0.0025778362583607642, "rouge2_precision_stderr": 0.0005426786000843293, "rouge2_recall": 0.005056626447538328, "rouge2_recall_stderr": 0.0008909529686166358, "rougeL_fmeasure": 0.025397669122061574, "rougeL_fmeasure_stderr": 0.001477824062133286, "rougeL_precision": 0.021633378868917415, "rougeL_precision_stderr": 0.0014766881089830067, "rougeL_recall": 0.039954198597200195, "rougeL_recall_stderr": 0.002427151773773013, "rougeLsum_fmeasure": 0.024937374055980524, "rougeLsum_fmeasure_stderr": 0.0014662690411890744, "rougeLsum_precision": 0.021302386042875585, "rougeLsum_precision_stderr": 0.0014719776290891184, "rougeLsum_recall": 0.039097440525313706, "rougeLsum_recall_stderr": 0.0023851152450321484}}, "5": {"article_DOC_summary": {"bleu": 8.502508359632142e-38, "bleu_stderr": 1.6224422585741e-33, "rouge1_fmeasure": 0.0015539306240835566, "rouge1_fmeasure_stderr": 0.00042422089610158747, "rouge1_precision": 0.0016784285108223424, "rouge1_precision_stderr": 0.00046933173483779724, "rouge1_recall": 0.001546401819187842, "rouge1_recall_stderr": 0.00043408961008404196, "rouge2_fmeasure": 5.19777535214928e-05, "rouge2_fmeasure_stderr": 5.1977753521493134e-05, "rouge2_precision": 5.04489960649783e-05, "rouge2_precision_stderr": 5.044899606497852e-05, "rouge2_recall": 5.360205831903945e-05, "rouge2_recall_stderr": 5.360205831903959e-05, "rougeL_fmeasure": 0.0014344496420688818, "rougeL_fmeasure_stderr": 0.0003937675400802755, "rougeL_precision": 0.0015267311823769563, "rougeL_precision_stderr": 0.00042072964022598367, "rougeL_recall": 0.001446281044627963, "rougeL_recall_stderr": 0.000414342893101565, "rougeLsum_fmeasure": 0.001442691159744362, "rougeLsum_fmeasure_stderr": 0.0003927017017637451, "rougeLsum_precision": 0.0015235781201228954, "rougeLsum_precision_stderr": 0.0004126564052488673, 
"rougeLsum_recall": 0.0014595891418657935, "rougeLsum_recall_stderr": 0.00041568179279564054}}}} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0138442ddb7d382a72c607241668de45eaf0c2c9 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5083902070671548, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03989468844910082 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07528694149270759, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001485415514756619 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3780318171066232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005223104661769293 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11728631797246779, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019247654134608228 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03516210005523054, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009572816724584412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18705373069728767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037081482160342696 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05468830386197015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012066786386101694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07002377028639689, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013498282472237842 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3530593907516912, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00490724418597446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10921255839870517, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017509846379559321 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07051076285503056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014029195285359902 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3526831574759125, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004764676209885756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 
0.10965304015302584, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00179658805334725 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e38e962836e2b0407fad2bcae8362ef1e1818b04 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.525451157972169, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.020807029076595943 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07578151398935791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014658497562735056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3776088572433392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005222009047076881 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11815166842945922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019539933108443297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035216953195913794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008936559179157622 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18552571500246623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003651941896665819 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05505652232011781, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012284406115594482 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07009792254358821, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013082058038069526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35054744472531996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004791505363230279 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1094938648119312, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017598889432284988 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"PALM_prompt", + "rougeLsum_precision": 0.07078544810917187, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013654984516325997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.35187230618740717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004734030010907423 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11026920423657147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018110860096435387 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..520c66c53a615ec0ca9314f4bff4a2977370ae93 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0073229441233485515, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0007507591641053894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.01188187354308511, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001193460962416848 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.007782363958910315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007494140926600601 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0014793812141488027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000229663938081861 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.002825244488953548, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005238543235168785 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0016405694963126324, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00023525203173163016 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.005727289612377416, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0005861714348780361 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009446302855839223, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + 
"subset": null, + "rougeL_recall_stderr": 0.0009620615699727216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006018804038099202, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005623113681535245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.00679434254221087, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0006951873055254002 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.011060401872827133, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011163157558010933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.007193842982164766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006896239402426108 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.090669570430738e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 8.489969077479997e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..1f602f714c1a4e5bfc64485a90750a0bbec3153f --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.255556843183474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14306801493502502 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.40133894612714033, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0022786254106147374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4353400028716086, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025934401772713216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.40223710706751803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019113695305159313 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.17167615815004073, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 
0.0017373085671162421 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18892368452661648, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002036271603133117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17238412451550736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001647066730421544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2895776384246929, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018377899797485326 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.31561872620267456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002186944465790045 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2905126850575018, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016066395148831222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.33661571162841014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0021532181771208945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36620748100065265, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002517396529311699 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3376761630795252, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001906552910299397 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8ad8e790515278060e74c1ee3b1cc10a20d155d5 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.453538794646896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1018770051872085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.40269265483237937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, 
+ "rouge1_precision_stderr": 0.0022414598319616666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4321840404996293, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002501669112723814 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.40172960710680156, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0018233890583034939 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.17489910198304812, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001749578738044224 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18884294973750174, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001942308180411026 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17413364387783356, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001607923940126345 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.29510908654175555, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00188243407356544 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.31785854835047744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021450790852441397 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.29458902141806165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016001785328170475 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3399297127876474, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002185077308803786 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.364926897184195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00242487417897827 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3390409817775856, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018684469365198332 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json 
b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0a7a35e91aec1ee3029d132af0620724075a986c --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.024667231581309436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016446798164940699 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.04578732040023698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0027949727542311478 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.029257582846789993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0017353864694049216 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0025778362583607642, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0005426786000843293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.005056626447538328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0008909529686166358 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0029439626544144923, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00046267375464988764 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.021633378868917415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014766881089830067 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.039954198597200195, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002427151773773013 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.025397669122061574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001477824062133286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.021302386042875585, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014719776290891184 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.039097440525313706, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0023851152450321484 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.024937374055980524, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0014662690411890744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.27307893333920474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07257669784005619 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..df6b00c1a7e7465b36e730b3244bd57ae754de24 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/generation/slim.4b284b12bc4seed2_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0016784285108223424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00046933173483779724 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.001546401819187842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00043408961008404196 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0015539306240835566, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00042422089610158747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 5.04489960649783e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 5.044899606497852e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 5.360205831903945e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 5.360205831903959e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 5.19777535214928e-05, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 5.1977753521493134e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0015267311823769563, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00042072964022598367 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.001446281044627963, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.000414342893101565 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0014344496420688818, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0003937675400802755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0015235781201228954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004126564052488673 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0014595891418657935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 
0.00041568179279564054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.001442691159744362, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0003927017017637451 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 8.502508359632142e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.6224422585741e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed2/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0.csv b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..3e990ed1dfe607c5ab6afb9fe35fd4b62573b878 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.01500870618212173,0 +anli_r2,acc,0.343,0.015019206922356953,0 +anli_r3,acc,0.3491666666666667,0.013767075395077249,0 +arc_challenge,acc,0.2713310580204778,0.012993807727545797,0 +arc_challenge,acc_norm,0.2960750853242321,0.013340916085246268,0 +arc_easy,acc,0.5526094276094277,0.010202832385415644,0 +arc_easy,acc_norm,0.5004208754208754,0.010259779886094424,0 +boolq,acc,0.5770642201834862,0.008640558744656426,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.24338624338624337,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.4737104162517427,0.004982879340691403,0 +hellaswag,acc_norm,0.616211909978092,0.004853134271547751,0 +piqa,acc,0.7393906420021763,0.010241826155811625,0 +piqa,acc_norm,0.749183895538629,0.010113869547069046,0 +rte,acc,0.555956678700361,0.029907396333795994,0 +sciq,acc,0.84,0.011598902298689005,0 +sciq,acc_norm,0.758,0.013550631705555954,0 +storycloze_2016,acc,0.7076429716729022,0.010518239729787743,0 +winogrande,acc,0.5943172849250198,0.01380020633601421,0 diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0_lm-eval_global_step80108_2023-02-24-15-37-27_0shots_backup.json b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0_lm-eval_global_step80108_2023-02-24-15-37-27_0shots_backup.json deleted file mode 100644 index cad9c7b2a6c240d582231cff65869c2c0d613181..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_0_lm-eval_global_step80108_2023-02-24-15-37-27_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.01500870618212173 - }, - "anli_r2": { - "acc": 0.343, - "acc_stderr": 0.015019206922356953 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077249 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.24338624338624337 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.4737104162517427, - "acc_stderr": 0.004982879340691403, - "acc_norm": 0.616211909978092, - "acc_norm_stderr": 0.004853134271547751 - }, - "rte": { - "acc": 0.555956678700361, - 
"acc_stderr": 0.029907396333795994 - }, - "winogrande": { - "acc": 0.5943172849250198, - "acc_stderr": 0.01380020633601421 - }, - "storycloze_2016": { - "acc": 0.7076429716729022, - "acc_stderr": 0.010518239729787743 - }, - "boolq": { - "acc": 0.5770642201834862, - "acc_stderr": 0.008640558744656426 - }, - "arc_easy": { - "acc": 0.5526094276094277, - "acc_stderr": 0.010202832385415644, - "acc_norm": 0.5004208754208754, - "acc_norm_stderr": 0.010259779886094424 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.012993807727545797, - "acc_norm": 0.2960750853242321, - "acc_norm_stderr": 0.013340916085246268 - }, - "sciq": { - "acc": 0.84, - "acc_stderr": 0.011598902298689005, - "acc_norm": 0.758, - "acc_norm_stderr": 0.013550631705555954 - }, - "piqa": { - "acc": 0.7393906420021763, - "acc_stderr": 0.010241826155811625, - "acc_norm": 0.749183895538629, - "acc_norm_stderr": 0.010113869547069046 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1.csv b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..e7ed30518b3df044c3355653e4b3265235343336 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.325,0.014818724459095527,0 +anli_r3,acc,0.35583333333333333,0.013826518748493314,0 +arc_challenge,acc,0.2687713310580205,0.012955065963710691,0 +arc_challenge,acc_norm,0.29948805460750855,0.013385021637313574,0 +arc_easy,acc,0.577020202020202,0.010137328382209094,0 +arc_easy,acc_norm,0.5315656565656566,0.010239317603199507,0 +boolq,acc,0.598776758409786,0.008572708337178997,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.40095238095238095,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4733120892252539,0.00498266845211894,0 +hellaswag,acc_norm,0.6218880701055567,0.0048392473326060465,0 +piqa,acc,0.7578890097932536,0.009994371269104381,0 +piqa,acc_norm,0.7633297062023939,0.009916841655042809,0 +rte,acc,0.592057761732852,0.029581952519606193,0 +sciq,acc,0.837,0.011686212712746839,0 +sciq,acc_norm,0.788,0.012931481864938034,0 +storycloze_2016,acc,0.7012292891501871,0.010584692134739969,0 +winogrande,acc,0.580110497237569,0.013870943986310393,0 diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1_lm-eval_global_step80108_2023-02-24-15-37-27_1shots_backup.json b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1_lm-eval_global_step80108_2023-02-24-15-37-27_1shots_backup.json deleted file mode 100644 index 08cfbbdd32322fb772efecdc673ba229a0103346..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_1_lm-eval_global_step80108_2023-02-24-15-37-27_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095527 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.013826518748493314 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.40095238095238095 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 
0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4733120892252539, - "acc_stderr": 0.00498266845211894, - "acc_norm": 0.6218880701055567, - "acc_norm_stderr": 0.0048392473326060465 - }, - "rte": { - "acc": 0.592057761732852, - "acc_stderr": 0.029581952519606193 - }, - "winogrande": { - "acc": 0.580110497237569, - "acc_stderr": 0.013870943986310393 - }, - "storycloze_2016": { - "acc": 0.7012292891501871, - "acc_stderr": 0.010584692134739969 - }, - "boolq": { - "acc": 0.598776758409786, - "acc_stderr": 0.008572708337178997 - }, - "arc_easy": { - "acc": 0.577020202020202, - "acc_stderr": 0.010137328382209094, - "acc_norm": 0.5315656565656566, - "acc_norm_stderr": 0.010239317603199507 - }, - "arc_challenge": { - "acc": 0.2687713310580205, - "acc_stderr": 0.012955065963710691, - "acc_norm": 0.29948805460750855, - "acc_norm_stderr": 0.013385021637313574 - }, - "sciq": { - "acc": 0.837, - "acc_stderr": 0.011686212712746839, - "acc_norm": 0.788, - "acc_norm_stderr": 0.012931481864938034 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.009994371269104381, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2.csv b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..8fe7d9724c19795faf92e1805e4f61ed3515e409 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121728,0 +anli_r2,acc,0.323,0.014794927843348637,0 +anli_r3,acc,0.3358333333333333,0.013639261190932882,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858112,0 +arc_challenge,acc_norm,0.30716723549488056,0.013481034054980943,0 +arc_easy,acc,0.5837542087542088,0.010114819404500867,0 +arc_easy,acc_norm,0.5521885521885522,0.010203742451111525,0 +boolq,acc,0.6,0.008568368985904962,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.26656990807934206,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4715196176060546,0.004981680090303701,0 +hellaswag,acc_norm,0.6190997809201354,0.004846156699486671,0 +piqa,acc,0.7470076169749728,0.01014288869886246,0 +piqa,acc_norm,0.7573449401523396,0.01000200256970869,0 +rte,acc,0.5523465703971119,0.029931070362939526,0 +sciq,acc,0.846,0.011419913065098708,0 +sciq,acc_norm,0.806,0.012510816141264368,0 +storycloze_2016,acc,0.703901656867985,0.010557307688475123,0 +winogrande,acc,0.5753749013417522,0.013891893150264224,0 diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2_lm-eval_global_step80108_2023-02-24-15-37-27_2shots_backup.json b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2_lm-eval_global_step80108_2023-02-24-15-37-27_2shots_backup.json deleted file mode 100644 index fa393826673bdbf296d8f8e4a55c17d7345d72d2..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_2_lm-eval_global_step80108_2023-02-24-15-37-27_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.015008706182121728 - }, - "anli_r2": { - "acc": 0.323, - "acc_stderr": 0.014794927843348637 - }, - "anli_r3": { - "acc": 0.3358333333333333, 
- "acc_stderr": 0.013639261190932882 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.26656990807934206 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4715196176060546, - "acc_stderr": 0.004981680090303701, - "acc_norm": 0.6190997809201354, - "acc_norm_stderr": 0.004846156699486671 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939526 - }, - "winogrande": { - "acc": 0.5753749013417522, - "acc_stderr": 0.013891893150264224 - }, - "storycloze_2016": { - "acc": 0.703901656867985, - "acc_stderr": 0.010557307688475123 - }, - "boolq": { - "acc": 0.6, - "acc_stderr": 0.008568368985904962 - }, - "arc_easy": { - "acc": 0.5837542087542088, - "acc_stderr": 0.010114819404500867, - "acc_norm": 0.5521885521885522, - "acc_norm_stderr": 0.010203742451111525 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858112, - "acc_norm": 0.30716723549488056, - "acc_norm_stderr": 0.013481034054980943 - }, - "sciq": { - "acc": 0.846, - "acc_stderr": 0.011419913065098708, - "acc_norm": 0.806, - "acc_norm_stderr": 0.012510816141264368 - }, - "piqa": { - "acc": 0.7470076169749728, - "acc_stderr": 0.01014288869886246, - "acc_norm": 0.7573449401523396, - "acc_norm_stderr": 0.01000200256970869 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3.csv b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..e81e0d7f3e6300b0f9f342cdb809185c3e726983 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229875,0 +anli_r2,acc,0.331,0.014888272588203936,0 +anli_r3,acc,0.35583333333333333,0.013826518748493324,0 +arc_challenge,acc,0.26791808873720135,0.012942030195136442,0 +arc_challenge,acc_norm,0.31313993174061433,0.013552671543623501,0 +arc_easy,acc,0.5900673400673401,0.010091953527506246,0 +arc_easy,acc_norm,0.5627104377104377,0.01017876842932159,0 +boolq,acc,0.6070336391437309,0.00854233514797057,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.35664983164983166,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.47032463652658835,0.004980985384152898,0 +hellaswag,acc_norm,0.6199960167297351,0.004843954338451451,0 +piqa,acc,0.7524483133841132,0.010069703966857108,0 +piqa,acc_norm,0.7616974972796517,0.009940334245876224,0 +rte,acc,0.5740072202166066,0.029764956741777645,0 +sciq,acc,0.853,0.011203415395160331,0 +sciq,acc_norm,0.813,0.01233625482807413,0 +storycloze_2016,acc,0.7055050774986639,0.010540668963800296,0 +winogrande,acc,0.5769534333070244,0.013885055359056472,0 diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3_lm-eval_global_step80108_2023-02-24-15-37-27_3shots_backup.json b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3_lm-eval_global_step80108_2023-02-24-15-37-27_3shots_backup.json deleted file mode 100644 index 48ebc743641b38b1a7b77aa0a0804fec6da173f1..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_3_lm-eval_global_step80108_2023-02-24-15-37-27_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - 
"results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229875 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.014888272588203936 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.013826518748493324 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.35664983164983166 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 0.47032463652658835, - "acc_stderr": 0.004980985384152898, - "acc_norm": 0.6199960167297351, - "acc_norm_stderr": 0.004843954338451451 - }, - "rte": { - "acc": 0.5740072202166066, - "acc_stderr": 0.029764956741777645 - }, - "winogrande": { - "acc": 0.5769534333070244, - "acc_stderr": 0.013885055359056472 - }, - "storycloze_2016": { - "acc": 0.7055050774986639, - "acc_stderr": 0.010540668963800296 - }, - "boolq": { - "acc": 0.6070336391437309, - "acc_stderr": 0.00854233514797057 - }, - "arc_easy": { - "acc": 0.5900673400673401, - "acc_stderr": 0.010091953527506246, - "acc_norm": 0.5627104377104377, - "acc_norm_stderr": 0.01017876842932159 - }, - "arc_challenge": { - "acc": 0.26791808873720135, - "acc_stderr": 0.012942030195136442, - "acc_norm": 0.31313993174061433, - "acc_norm_stderr": 0.013552671543623501 - }, - "sciq": { - "acc": 0.853, - "acc_stderr": 0.011203415395160331, - "acc_norm": 0.813, - "acc_norm_stderr": 0.01233625482807413 - }, - "piqa": { - "acc": 0.7524483133841132, - "acc_stderr": 0.010069703966857108, - "acc_norm": 0.7616974972796517, - "acc_norm_stderr": 0.009940334245876224 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4.csv b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..17819de4ad281501d3f5d39adf5c7c3222c1b55f --- /dev/null +++ b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363935,0 +anli_r2,acc,0.337,0.014955087918653603,0 +anli_r3,acc,0.35,0.013774667009018552,0 +arc_challenge,acc,0.27474402730375425,0.013044617212771227,0 +arc_challenge,acc_norm,0.30802047781569963,0.01349142951729204,0 +arc_easy,acc,0.593013468013468,0.010080695355466598,0 +arc_easy,acc_norm,0.5568181818181818,0.010193324837773497,0 +boolq,acc,0.5975535168195719,0.008576992126012484,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.29090909090909095,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4698267277434774,0.004980687467486101,0 +hellaswag,acc_norm,0.6165106552479586,0.004852420856631488,0 +piqa,acc,0.7513601741022851,0.010084511234296857,0 +piqa,acc_norm,0.76550598476605,0.009885203143240536,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.855,0.011139977517890132,0 +sciq,acc_norm,0.814,0.0123107902084128,0 +storycloze_2016,acc,0.7108498129342598,0.010484068799942061,0 +winogrande,acc,0.5864246250986582,0.013840971763195303,0 diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4_lm-eval_global_step80108_2023-02-24-15-37-27_4shots_backup.json b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4_lm-eval_global_step80108_2023-02-24-15-37-27_4shots_backup.json deleted file mode 100644 index 
329e0bac2e7ad48b4b923b60cb0c1f45ad7a94dd..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_4_lm-eval_global_step80108_2023-02-24-15-37-27_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363935 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.014955087918653603 - }, - "anli_r3": { - "acc": 0.35, - "acc_stderr": 0.013774667009018552 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.29090909090909095 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4698267277434774, - "acc_stderr": 0.004980687467486101, - "acc_norm": 0.6165106552479586, - "acc_norm_stderr": 0.004852420856631488 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5864246250986582, - "acc_stderr": 0.013840971763195303 - }, - "storycloze_2016": { - "acc": 0.7108498129342598, - "acc_stderr": 0.010484068799942061 - }, - "boolq": { - "acc": 0.5975535168195719, - "acc_stderr": 0.008576992126012484 - }, - "arc_easy": { - "acc": 0.593013468013468, - "acc_stderr": 0.010080695355466598, - "acc_norm": 0.5568181818181818, - "acc_norm_stderr": 0.010193324837773497 - }, - "arc_challenge": { - "acc": 0.27474402730375425, - "acc_stderr": 0.013044617212771227, - "acc_norm": 0.30802047781569963, - "acc_norm_stderr": 0.01349142951729204 - }, - "sciq": { - "acc": 0.855, - "acc_stderr": 0.011139977517890132, - "acc_norm": 0.814, - "acc_norm_stderr": 0.0123107902084128 - }, - "piqa": { - "acc": 0.7513601741022851, - "acc_stderr": 0.010084511234296857, - "acc_norm": 0.76550598476605, - "acc_norm_stderr": 0.009885203143240536 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5.csv b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..bf7ca695c779e9603a7d19a8d8dea80bcf62b417 --- /dev/null +++ b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.353,0.015120172605483692,0 +anli_r2,acc,0.315,0.014696631960792503,0 +anli_r3,acc,0.3425,0.013704669762934727,0 +arc_challenge,acc,0.2858361774744027,0.013203196088537369,0 +arc_challenge,acc_norm,0.29948805460750855,0.013385021637313572,0 +arc_easy,acc,0.5921717171717171,0.010083950240041214,0 +arc_easy,acc_norm,0.5580808080808081,0.010190328123071765,0 +boolq,acc,0.6067278287461774,0.00854350553741787,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.32716049382716045,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.46703843855805616,0.004978927164792884,0 +hellaswag,acc_norm,0.6155148376817368,0.004854791378657001,0 +piqa,acc,0.749183895538629,0.010113869547069044,0 +piqa,acc_norm,0.7584330794341676,0.009986718001804448,0 +rte,acc,0.5740072202166066,0.02976495674177765,0 +sciq,acc,0.865,0.010811655372416051,0 +sciq,acc_norm,0.834,0.011772110370812189,0 +storycloze_2016,acc,0.7076429716729022,0.01051823972978774,0 +winogrande,acc,0.5643251775848461,0.013935709739615713,0 diff --git 
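Note: the rankeval CSVs added in this diff are flattened counterparts of the deleted *_backup.json files: one row per task/metric pair with columns task,metric,value,err,version, where err is empty for metrics without a standard error (e.g. the cb f1 rows). A quick way to compare the 0-shot through 5-shot runs might look like the following sketch (the glob pattern assumes the directory layout shown in this diff):

import csv
import glob

# Collect accuracy per task across the 0..5-shot rankeval CSVs.
table = {}  # task -> {shots: value}
for path in sorted(glob.glob(
        "4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_[0-5].csv")):
    shots = int(path.rsplit("_", 1)[1].split(".")[0])
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row["metric"] == "acc":
                table.setdefault(row["task"], {})[shots] = float(row["value"])

# Print one row per task with its 0- through 5-shot accuracies.
for task, by_shots in sorted(table.items()):
    cells = "\t".join(f"{by_shots.get(s, float('nan')):.3f}" for s in range(6))
    print(f"{task}\t{cells}")
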
a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5_lm-eval_global_step80108_2023-02-24-15-37-27_5shots_backup.json b/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5_lm-eval_global_step80108_2023-02-24-15-37-27_5shots_backup.json deleted file mode 100644 index 6f4016506d3756e7477d39d42e1faeffeb147efc..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed2/evaluation/rankeval/4b284b12bc4seed2_5_lm-eval_global_step80108_2023-02-24-15-37-27_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.353, - "acc_stderr": 0.015120172605483692 - }, - "anli_r2": { - "acc": 0.315, - "acc_stderr": 0.014696631960792503 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934727 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.32716049382716045 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.46703843855805616, - "acc_stderr": 0.004978927164792884, - "acc_norm": 0.6155148376817368, - "acc_norm_stderr": 0.004854791378657001 - }, - "rte": { - "acc": 0.5740072202166066, - "acc_stderr": 0.02976495674177765 - }, - "winogrande": { - "acc": 0.5643251775848461, - "acc_stderr": 0.013935709739615713 - }, - "storycloze_2016": { - "acc": 0.7076429716729022, - "acc_stderr": 0.01051823972978774 - }, - "boolq": { - "acc": 0.6067278287461774, - "acc_stderr": 0.00854350553741787 - }, - "arc_easy": { - "acc": 0.5921717171717171, - "acc_stderr": 0.010083950240041214, - "acc_norm": 0.5580808080808081, - "acc_norm_stderr": 0.010190328123071765 - }, - "arc_challenge": { - "acc": 0.2858361774744027, - "acc_stderr": 0.013203196088537369, - "acc_norm": 0.29948805460750855, - "acc_norm_stderr": 0.013385021637313572 - }, - "sciq": { - "acc": 0.865, - "acc_stderr": 0.010811655372416051, - "acc_norm": 0.834, - "acc_norm_stderr": 0.011772110370812189 - }, - "piqa": { - "acc": 0.749183895538629, - "acc_stderr": 0.010113869547069044, - "acc_norm": 0.7584330794341676, - "acc_norm_stderr": 0.009986718001804448 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2c0f77633aa08871a3beef15effc9c35aa3ebc49 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.38932054005592026, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.033555755060432874}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07700762746532616, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017574979074395903}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32659511083930237, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004731650619590043}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11567054917865605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020244917898064024}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03638446401446654, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010787709184703832}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16083759865432445, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003301779480848987}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05486406523084765, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001296868906653324}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07349742960861025, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015742317195240577}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3166189013286804, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004624777728623184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1109504406851204, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018678472410080015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07279482633328832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016761377220670234}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3093845485351636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044227058124167145}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10923439857735848, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018915212851263181}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..068d7bef2ebda5f543bbeb093fb95f325232e3f4 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4010897789440996, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.035896717451801716}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07680237479305678, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015098564819853281}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33534169873311565, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004781931847971189}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11699641676828608, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019738238820560176}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03628266659322727, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000933027012126572}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16628641951511963, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033828151468031586}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.055756584513908414, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001272076817653578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07343722417982688, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013890074049552911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3242524897206685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004652592170560419}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11218413128284264, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018370252279043208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07304381756570862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014318516789194186}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.31848926688825263, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004464167390774732}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11118263806943471, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00186587403392927}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7caf9111e6d42f747d820316eb0159d327143a12 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.42050495009420186, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.032401125485335705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07641858629031524, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001512634713162659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33877843978124605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004754821881470426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11702064522926466, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019445019569898875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.036264403146498855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009035954944938369}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1690812614534248, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003434599850442417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05594462621018749, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012491415665991895}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07283896117332733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013782370339395086}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3264281887874627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004614214472784931}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11189801577003984, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018101065557781222}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07258450279456244, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001408393730596637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3215563384939613, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004418413263595925}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11116640622510127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018282586995381895}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..26fb7b87fbb9ef2b31dc7d5b166750eda07d8d13 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4159282397373365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028655242365073413}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07708995134838893, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016510440818257972}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3354474085870324, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00468185355577616}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11669978392247876, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001947629137421236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.036292243594643316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009640743250760637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1671806366343255, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033800199116181236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05573726170555429, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001260032003769061}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07334583374730402, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015569789464435146}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3219883012710275, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004531388461994007}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11124128735569962, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018188725421961862}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0731288278640945, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001584414887403581}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3175376346502283, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004360960463880489}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1105408760944851, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018355827239676303}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fb3244dea83f438988cb178ee6147666c18d7863 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.45977953222907975, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.040383389212127606}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07912418599977926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016198563555509216}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3469232249585729, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004748846708374396}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12037862744280613, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019934979793417573}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03786394920640546, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009914452911534558}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1738267805541333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0034485131641693997}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.057885613023243146, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012856906291534122}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07493423869370922, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001498740213643017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3315897347595549, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004565989220241603}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11423756257112685, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001835936651182981}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07489164236007877, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015400968477694266}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.32772172422348506, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0044015503137291995}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11380829873927818, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001868863082530539}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..756de295c2ad7077d4cebfe26e0dc4daad2d1c89 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4100101691229707, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.038169326985519704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.0761749476924903, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015866409368917428}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33694404520159127, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004725880718920406}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11616969508673808, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001954781890870284}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03627352196162682, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000977188491579374}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.167813784642509, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033985471522842725}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.055555746394993916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012524254498175414}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07259356194973668, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014842070922326297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3238850222294547, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00455273082293743}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11092231794095461, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018208627331043194}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07239284886715931, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015076194366894973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3201817418126636, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004401296672435449}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11030205012975042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018300168280422182}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..ceb1afad4b1db2a53db2ccb371c89bc40f82ff2b --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.09250196886098348, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012387826019604092}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.12905376314562253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013091825113428792}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.09998460503837092, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010700519248261517}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.004542792640061134, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00021902406054084622}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.005267988260289562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002641606109254951}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004548025347793951, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00021301188784611375}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.0806751359892451, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000988255429146176}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.11638678461084004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0011920958367752498}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0884294786747584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008862698881090451}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.08931760733909207, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011977856331501172}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.12471118748837833, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR 
in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012621097866983477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.09653806771590727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0010284292205864202}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.06823499187294975, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02672839530839174}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d13fa2f618ad43c63887c2a0ddbbe470dd5690b3 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.10227535255850072, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015211441148919019}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.1666238343075504, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002113191459192441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1176727050001766, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015061256733290465}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.010133390687879478, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 
0.00045221775358988313}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.017278009943298343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008307221313804175}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.011758190082089873, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005161253570713452}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.0840035133484757, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00108582843746046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1409822252220471, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016244906370306395}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0977465022371193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010778296010975216}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.09563245107317732, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014144569423562025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.15636027194288163, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001975628206486446}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.11014610167130383, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014009647264510289}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.627587943269457, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03752664558072816}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7248505e2eb4be6435d1a5a4d57d05a57d13c96d --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.13533522102182108, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018951373520473378}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.22856932435608124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027080432167508025}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.15805650713785413, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019004938271445784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.023543013388107385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000717310361867752}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.040832806012490945, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001298377746871254}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.02760738077834664, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008049340257416579}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.10293794994726356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012437549972814086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1802214254693049, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020107380760737443}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.12169470238796481, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012563483854001994}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.12611401083224677, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017609062395669717}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21348564535890185, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025299144255695695}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.14735538469349385, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017649380102418163}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.3448790104080863, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06809231563626762}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2d4b5f515052bf0a9db9f3ee6e680e097b97b306 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.12776173884849967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002166703255999288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2075725639546676, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0030329813736934214}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.14320563335091396, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020698343768995294}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.022636857885145915, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007440116219601286}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0397256121665061, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001322773196274425}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.026283043574202986, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0008159202511212215}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.09657179764839124, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015732121770481863}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeL_recall": 0.16159908811630536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023213429955332913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.10883270044311108, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014339155970440763}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.11866324620879036, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020254088359404664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.19292621911672542, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002834333227053115}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.13283753381327199, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019146305178959213}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.528039320029562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.036905573594762583}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4aa689d51c614e1b9d82c4f4dd386e00a50b35b2 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.04598706351137218, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0018221188645625001}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.0739807986804339, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002646171903509083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.04899842107573778, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017104674234628445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.008841756168682364, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006116495968030462}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.015468257674774097, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0009624564354622926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.009690319746932921, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005414022669991509}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.03612427764892416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014536883711277432}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.05940229969748619, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002132196101860115}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.03836996592095654, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012956784650356502}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.04261326941140918, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017129168307166089}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.06810975113263097, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024407540230673218}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.04519059371598313, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001579759147906247}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.37223171728306426, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03014516594133762}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..38aef208b5c48ca45ad028c28ced555e9ea1d932 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.007551294746242374, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008016066090893644}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.01175212455089267, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011644529839797179}, 
{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.007820849054110231, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007600843285544493}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0014154766947717414, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00021722050245300052}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.002696191748609163, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004898145017501464}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.001550055750323692, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002154946138593672}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.005847845224705761, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000595323185000189}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.009647797063887112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0009660140204759589}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.006164411990577186, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0005772505558321714}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.006936670272296256, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007360722546012934}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.010824451468983412, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0010820316072561469}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.007179693639002067, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006977167152892271}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 5.013143696035085e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.50150744136576e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..c9fe2c9a0b0e9b7805b146947c9ffb16d9426554 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.09084719792363964, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.023524182784253476}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.013115055244320387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003168780725030529}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.023423577684344953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005821108276955901}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.016457051237123384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0003925090249226089}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.0003338211382113822, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 8.920509387302503e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.0005083527300182993, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00013811375079865392}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0003989196235540006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00010791151592722659}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.013115055244320387, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0003168780725030529}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.023423577684344953, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005821108276955901}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.016457051237123384, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0003925090249226089}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.012659903516962778, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00029850433041728446}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.0226118934358276, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005511022657739085}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.015885414699151633, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00036971385737970014}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..32481c38bb739c24ab74afb2db30710371a88c10 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.493656512933155, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06507341585485718}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3973818846475776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0026976072721873632}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4223117985183209, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002783358808801643}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.38211667792792864, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002042530024715172}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.16992657792281268, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019145879314252548}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.178124519625553, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018587557359939306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.16014559457551358, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014918368862659674}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2784440754511285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002173881277475784}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.2960008580013917, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00223721376663644}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2656366947575347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00154921779515304}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.33386523672528523, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002495920334450439}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3524456027280379, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00250316491749807}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3191928781032716, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018752599773732524}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9bfb7c916bd857654f7fb67a0a1148c24da921ce --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 9.70861649242996, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.180728212256743}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.4246829342811345, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002766857681080391}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.42902360153249547, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026637334370122148}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4028985936075782, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019913008996638617}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.188091439420101, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019527339746865022}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18884241914729302, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.001866659004782858}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17685301250165733, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015810873135005685}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2982294118028583, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0022177478762698505}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3014772250776991, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021343172062020443}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.28212495177649627, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016022344134082288}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3532941586993858, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002547817337684264}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3559974321347138, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024336261264831867}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.33457230441540664, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018977411692316212}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..68eb5c68d0bd80003f8332be8ea13fc257e99144 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 9.966588735110538, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12599090386040324}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.42159026565077223, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002651117861201503}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.436277297035412, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026416756972084655}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.4068505832141505, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019329494846097367}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.18776315928969667, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018907826120837835}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19380136137444595, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019180421049331063}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17997150688916153, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001592406273022251}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2954653149171501, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002167984706061261}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.30557723457112207, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021619718205346107}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.28424346982191884, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016109414932517093}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3525432765457679, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024285145519395723}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3651612369599634, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024667743399167472}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3401366732280647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018580545934390128}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a44ac36f3f845b4ddd7e8a170b7b4544c1e5bb0b --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 9.251162519243813, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.16043314004667306}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.40242023070868665, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0027480602148687965}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.436755013677424, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002566467851228464}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3969692723001995, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019731818045355485}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.176937710453779, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019212015420408378}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19058682088957968, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018838368515368652}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.17289054467515227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00158925330122968}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2782288135889908, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002208420858744797}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.30117513507531163, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002108417735667974}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2734957726548441, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016372934907566048}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3343952904639146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024871514512787504}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36379303588555467, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024348682831782293}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.33022165481719784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001911613908677662}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e512e1788329d4def9971282449fa184ade644d5 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.324834183472209, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12201022191511865}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3770667876954104, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002761667549229222}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4384978042745568, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002504170522982392}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3826658470581734, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019397433080247473}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.16298069992158787, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.001917959278145272}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18727836348007232, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018506477395530997}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1633047480034024, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015591168107894247}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.263196349500877, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002234840925291514}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.30529924132930725, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021085926401520298}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.2662588358520316, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016519997409068458}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3161706099613126, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002502426291497884}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36828376734500556, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002374793122680808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.3210382724978914, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018713242691196597}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..4f84724ad0443033fa2a7b397a36c740400c60ff --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07457722326961545, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010813149665918967}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.18616767062678954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023262601944921035}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.10525034871077296, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014205929613240243}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.004128841009305119, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00025477847907952686}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01051230796440712, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006602646260890872}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.005853296742054689, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003595212542050643}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06451608999455927, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008702064411682341}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1622598214643354, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019407556202309798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0912182400057594, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001142160193884147}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.06318245796876595, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008975507843696847}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.15913828904970537, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002044934654600564}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.08935271982090909, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001186659250809151}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.050773439614255005, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.009607935815050534}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b43d92cfd5fe8765c573271312915536d11ed29d --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07879837485772763, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0013599444284337506}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.19465714656847785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0029003189713371035}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.11065247004530705, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017812843994385369}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.006994168819790609, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004648272193447288}, {"task_name": 
"gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.017050685879784068, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011197895634276233}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.009780488414863557, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0006426504271738838}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.06626180423614893, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00103152044172589}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.16506949443493862, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002243999574785469}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0932686816822836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001351090628116566}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.06443833252424745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0010900785870787669}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1605294884501161, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024099921996914896}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.09068800328595211, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0014406209430729104}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.37658580921002954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": 
"a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03383903342773015}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..8bc2de13292c3f7b3d192f5e12ae55220444d305 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0889047742823774, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015736385663893536}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.21871321971864371, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003489898707290529}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.12480432119777549, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021054789997765754}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.009892665457522843, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0006521664121306877}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.02437366955189581, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0015659961501778526}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.013892927856252131, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", 
"prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009041385431726432}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.07214973246105617, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00116007366829089}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1791872648503229, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002631093758225792}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1015360359782745, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015523847061620607}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.07278538171522661, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001284146511191204}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.18033245428711908, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0029284834426938845}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.10235719035547435, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017287630454853157}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.5613289650024746, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06954224370193056}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git 
a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c58c4a8bb434b8b58be2778d225ce1ee8e50dad0 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.09397556603911704, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017994169570713602}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.22313205728599414, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0039035927988892247}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.12920118195381422, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002284952127981043}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.01255758062915019, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007553728961037906}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.030710887496420355, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018415895742210585}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.017334182525631046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0010127873838950666}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.07567512644757472, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0013958572399353046}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.18027261644595613, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": 
null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003034050646720842}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.1039819933524293, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017400868646152738}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0775560956207995, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015015791755260772}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1849635012704667, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003277420997428528}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.10658308430361232, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018787907354694993}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.7079488064244481, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07074366573560065}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..199d967ae12521545c332b1b5835a63d892787ac --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.03264760362151805, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: 
||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020833568971290787}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.061777608876693486, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0037289394788052755}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.03895580700792708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002269413843476501}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0038550154412843093, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0004446008638460962}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.009089864083505584, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0010911425483655028}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0051666306699138615, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005905785787950259}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.025565238785773294, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016418229878507925}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04842391895975862, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002904785956254577}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.030269389323315635, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017191936988352017}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 
0.026709322906345055, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017170791129193154}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.050929122969953576, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0031086814353982915}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.03182638094199837, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001849989660796298}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.4082429912701259, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0779765946109533}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b93e633adc73e9d3051223840e9d82251fe4a1a4 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/agg.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.002487457924399334, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0007733944656593977}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0019894770885392197, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.000580222680643798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0021817427772383405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", 
"prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006550320885850304}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0003335239184295789, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002474907080024258}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.00022186932564291056, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00016062217222749233}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00026626492060156015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00019473030694105065}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0018641694287175915, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005696317403990395}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.001490936334422112, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004262092207690586}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.00163204036870977, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00048037762637480547}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0018726329116100717, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005610053066563542}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0015127742100335723, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_recall_stderr": 0.00042560130622958597}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0016483540386329559, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004762553014171529}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 2.3068185785066287e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 6.496274000069576e-33}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..8744ffccb4a3cdbd7fff12be47cf93fa976433f8 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96ccd2db5116d7da69b19ce4cea7437183620c2c223fc0a2e51e58a58a655fe9 +size 4086631 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..d052784698a904068e5f65930e1ead8b4ae57f8f --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47606e5b6c338c73a426540bef637d9af8667b52ab44e83b644c7396a047b766 +size 5000632 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4636bd3e441b81a53b1c8a09415a1241588eea93 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce3711bae0fccb5195b968ae576841dca48e198bd8a92991cae9e6234d53896a +size 5906452 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..a4c287dd398d1ab0f56ca6acf40d8a1cb207a6cf --- /dev/null +++ 
b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:497ab699e0a9bafee589fc982d25676e169c40a16df4cc4952ff8333d36babd9 +size 6811044 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b0ff32885ee7ac965da41775667472e627a57e2b --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b6f9870e2c5ad18d7f22316dc61e909d84c8c513ed3420ea073ef37301d178 +size 7705698 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..73536302b80098ed26e0ef4c42dfa4992a496dd7 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b6b75525c7b25da77530f94c84f8cd27fb9c8420c8493fb23d516e37bbfa367 +size 8622572 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0db4963a990b04de9b5798ff9fd2a76cdd64af15 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f4cc9526db191dc5a927d8a167f4b91f2d5bf06f9a6b5acea895ad9ad7d6c8 +size 7559324 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c24fa3df0ebb53f6db752b3cd811b9b3ebb96b75 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e978ac668096d2fc6b888a33e2cac5b27ce07a2328b57ec4aa3f5680919e32f +size 13249688 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..30de1c3c59921eaf96966175e55079c7c8347c84 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983e22c423ffd7998c32c8ccb7f7bd1bb6fc93ec9c8ea396dd6b0d84df097607 +size 18885817 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..114cda5975abd1b137b01f355149f1d225626c35 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4deb9776c28daa3e6328ff29e0b8cb64630211d0757c871a18ccd081e90d783 +size 24329816 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3644452b6c7a929a6d67bb670f56f3d9293343f6 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7ab399193e3b7c313325f88289828e5a304643552ca25fcd89c4d4163e5c30e +size 29474107 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b5d0ba14256c9aa8e1f1f6b536ad7c7b98a61ef2 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae3e0c585edc63602064770731b4420b17fa3c855638d8721d68fa25c197324 +size 34800480 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..2d9c5e5271e5d940a599a023948762f03a81e033 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9720ec86f0795ab325ee95ff4069d997a8138bfc178f449f4e60987ca6d23b28 +size 4519968 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..b638fc920526b0c57769e2e2f197d1a0756baefc --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed8278d3f2376d89c7adf1daf796aa681b2f05454d95d7b7636c8c0360dcff4f +size 5175902 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1f4652c64d950d2efd2428e745b520a799c19372 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:086c8b1cd1365ab9dd663c18ca17a6b565653a902b75b8062ba757d4c65d7b53 +size 6222490 diff --git 
a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4b3a85758fa669260516e2cf838a939134e97a50 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fb0cc3df090515f882e20cee8bc65d69e1c06965e9688d8f2db9ab4b64d99fa +size 7309995 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..edd189738b02c27947da365fd73d499e33eba124 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f464445d0b4577ddb5ddeaa088490b2a5bc488f8e1e048dcd12b7e8c188df907 +size 8419260 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..3b8a6b5ec5003d2dd3f3c1e29cfd4d86ce577109 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5049c38bbdee981f1f5314093593bf37fd43ab0c08875525497619b9ce468b65 +size 9544814 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..5edb5480bafcc84a8f11dc85275bd5aa5545af39 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d08cbdc1e68b8645b72a000f999f24813ded9f26c336b46ff207f5199fd7782 +size 2836382 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..70d31814ab92a4eb9c9587159459b3d94ca55f50 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab808919690fac8a386d9d33c4392a89fca1d1e508d7bdb9196130afac07139e +size 5100751 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..4bd2587128e83f595a580dfdefb662d8a5fd1a1a --- /dev/null +++ 
b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4a6ee1b720ea8c2d6a11e2791ba5bfc6259e4ee3ceabb601880b224210a5bea +size 7375859 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..48dfbc4e668b49b0cbc1112f434ebf52d55e78aa --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b8a04476fd24dba3c96aa2524534c94e338f68bf45a285abb2efb32cae62606 +size 9645558 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f5c59ddcd7b57e158405f091a19001ec34029993 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3f0ac602b2cab1338c19b6f9081cbf1b475da1a5082d82d2b9e65fff5db193 +size 11672175 diff --git a/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.jsonl b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..f2b5cd6e3db33192b1591b709eea509402ab0b65 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/examples.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b0af9775f2be90b26eb1770104e6410d3eb3ce429deb86179faab148827a22 +size 13897569 diff --git a/4b284b12bc4seed3/evaluation/generation/merged.csv b/4b284b12bc4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..13dbde2d12a63587b5726158e85507e83ef88972 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0003989196235540006 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0003989196235540006 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.16014559457551358 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.16014559457551358 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.17685301250165733 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.17685301250165733 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.17997150688916153 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.17997150688916153 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.17289054467515227 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.17289054467515227 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1633047480034024 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1633047480034024 +e2e_nlg_cleaned,5,average,multiple,0.1422607210447402 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.005853296742054689 +gem_xsum,0,median,rouge2_fmeasure,0.005853296742054689 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.009780488414863557 
+gem_xsum,1,median,rouge2_fmeasure,0.009780488414863557 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.013892927856252131 +gem_xsum,2,median,rouge2_fmeasure,0.013892927856252131 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.017334182525631046 +gem_xsum,3,median,rouge2_fmeasure,0.017334182525631046 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0051666306699138615 +gem_xsum,4,median,rouge2_fmeasure,0.0051666306699138615 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00026626492060156015 +gem_xsum,5,median,rouge2_fmeasure,0.00026626492060156015 +gem_xsum,5,average,multiple,0.00871563185488614 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05486406523084765 +web_nlg_en,0,median,rouge2_fmeasure,0.05486406523084765 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.055756584513908414 +web_nlg_en,1,median,rouge2_fmeasure,0.055756584513908414 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05594462621018749 +web_nlg_en,2,median,rouge2_fmeasure,0.05594462621018749 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05573726170555429 +web_nlg_en,3,median,rouge2_fmeasure,0.05573726170555429 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.057885613023243146 +web_nlg_en,4,median,rouge2_fmeasure,0.057885613023243146 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.055555746394993916 +web_nlg_en,5,median,rouge2_fmeasure,0.055555746394993916 +web_nlg_en,5,average,multiple,0.05595731617978915 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.004548025347793951 +wiki_lingua_en,0,median,rouge2_fmeasure,0.004548025347793951 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.011758190082089873 +wiki_lingua_en,1,median,rouge2_fmeasure,0.011758190082089873 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.02760738077834664 +wiki_lingua_en,2,median,rouge2_fmeasure,0.02760738077834664 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.026283043574202986 +wiki_lingua_en,3,median,rouge2_fmeasure,0.026283043574202986 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.009690319746932921 +wiki_lingua_en,4,median,rouge2_fmeasure,0.009690319746932921 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.001550055750323692 +wiki_lingua_en,5,median,rouge2_fmeasure,0.001550055750323692 +wiki_lingua_en,5,average,multiple,0.013572835879948343 diff --git a/4b284b12bc4seed3/evaluation/generation/merged.json b/4b284b12bc4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac2930d5a78028f1f6f24305abe18ac0c205286 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.38932054005592026, "bleu_stderr": 0.033555755060432874, "rouge1_fmeasure": 0.11567054917865605, "rouge1_fmeasure_stderr": 0.0020244917898064024, "rouge1_precision": 0.07700762746532616, "rouge1_precision_stderr": 0.0017574979074395903, "rouge1_recall": 0.32659511083930237, "rouge1_recall_stderr": 0.004731650619590043, "rouge2_fmeasure": 0.05486406523084765, "rouge2_fmeasure_stderr": 0.001296868906653324, "rouge2_precision": 0.03638446401446654, "rouge2_precision_stderr": 0.0010787709184703832, "rouge2_recall": 0.16083759865432445, "rouge2_recall_stderr": 0.003301779480848987, "rougeL_fmeasure": 0.1109504406851204, "rougeL_fmeasure_stderr": 0.0018678472410080015, "rougeL_precision": 0.07349742960861025, "rougeL_precision_stderr": 0.0015742317195240577, "rougeL_recall": 0.3166189013286804, "rougeL_recall_stderr": 0.004624777728623184, "rougeLsum_fmeasure": 0.10923439857735848, "rougeLsum_fmeasure_stderr": 0.0018915212851263181, "rougeLsum_precision": 0.07279482633328832, 
"rougeLsum_precision_stderr": 0.0016761377220670234, "rougeLsum_recall": 0.3093845485351636, "rougeLsum_recall_stderr": 0.0044227058124167145}}, "1": {"PALM_prompt": {"bleu": 0.4010897789440996, "bleu_stderr": 0.035896717451801716, "rouge1_fmeasure": 0.11699641676828608, "rouge1_fmeasure_stderr": 0.0019738238820560176, "rouge1_precision": 0.07680237479305678, "rouge1_precision_stderr": 0.0015098564819853281, "rouge1_recall": 0.33534169873311565, "rouge1_recall_stderr": 0.004781931847971189, "rouge2_fmeasure": 0.055756584513908414, "rouge2_fmeasure_stderr": 0.001272076817653578, "rouge2_precision": 0.03628266659322727, "rouge2_precision_stderr": 0.000933027012126572, "rouge2_recall": 0.16628641951511963, "rouge2_recall_stderr": 0.0033828151468031586, "rougeL_fmeasure": 0.11218413128284264, "rougeL_fmeasure_stderr": 0.0018370252279043208, "rougeL_precision": 0.07343722417982688, "rougeL_precision_stderr": 0.0013890074049552911, "rougeL_recall": 0.3242524897206685, "rougeL_recall_stderr": 0.004652592170560419, "rougeLsum_fmeasure": 0.11118263806943471, "rougeLsum_fmeasure_stderr": 0.00186587403392927, "rougeLsum_precision": 0.07304381756570862, "rougeLsum_precision_stderr": 0.0014318516789194186, "rougeLsum_recall": 0.31848926688825263, "rougeLsum_recall_stderr": 0.004464167390774732}}, "2": {"PALM_prompt": {"bleu": 0.42050495009420186, "bleu_stderr": 0.032401125485335705, "rouge1_fmeasure": 0.11702064522926466, "rouge1_fmeasure_stderr": 0.0019445019569898875, "rouge1_precision": 0.07641858629031524, "rouge1_precision_stderr": 0.001512634713162659, "rouge1_recall": 0.33877843978124605, "rouge1_recall_stderr": 0.004754821881470426, "rouge2_fmeasure": 0.05594462621018749, "rouge2_fmeasure_stderr": 0.0012491415665991895, "rouge2_precision": 0.036264403146498855, "rouge2_precision_stderr": 0.0009035954944938369, "rouge2_recall": 0.1690812614534248, "rouge2_recall_stderr": 0.003434599850442417, "rougeL_fmeasure": 0.11189801577003984, "rougeL_fmeasure_stderr": 0.0018101065557781222, "rougeL_precision": 0.07283896117332733, "rougeL_precision_stderr": 0.0013782370339395086, "rougeL_recall": 0.3264281887874627, "rougeL_recall_stderr": 0.004614214472784931, "rougeLsum_fmeasure": 0.11116640622510127, "rougeLsum_fmeasure_stderr": 0.0018282586995381895, "rougeLsum_precision": 0.07258450279456244, "rougeLsum_precision_stderr": 0.001408393730596637, "rougeLsum_recall": 0.3215563384939613, "rougeLsum_recall_stderr": 0.004418413263595925}}, "3": {"PALM_prompt": {"bleu": 0.4159282397373365, "bleu_stderr": 0.028655242365073413, "rouge1_fmeasure": 0.11669978392247876, "rouge1_fmeasure_stderr": 0.001947629137421236, "rouge1_precision": 0.07708995134838893, "rouge1_precision_stderr": 0.0016510440818257972, "rouge1_recall": 0.3354474085870324, "rouge1_recall_stderr": 0.00468185355577616, "rouge2_fmeasure": 0.05573726170555429, "rouge2_fmeasure_stderr": 0.001260032003769061, "rouge2_precision": 0.036292243594643316, "rouge2_precision_stderr": 0.0009640743250760637, "rouge2_recall": 0.1671806366343255, "rouge2_recall_stderr": 0.0033800199116181236, "rougeL_fmeasure": 0.11124128735569962, "rougeL_fmeasure_stderr": 0.0018188725421961862, "rougeL_precision": 0.07334583374730402, "rougeL_precision_stderr": 0.0015569789464435146, "rougeL_recall": 0.3219883012710275, "rougeL_recall_stderr": 0.004531388461994007, "rougeLsum_fmeasure": 0.1105408760944851, "rougeLsum_fmeasure_stderr": 0.0018355827239676303, "rougeLsum_precision": 0.0731288278640945, "rougeLsum_precision_stderr": 0.001584414887403581, "rougeLsum_recall": 
0.3175376346502283, "rougeLsum_recall_stderr": 0.004360960463880489}}, "4": {"PALM_prompt": {"bleu": 0.45977953222907975, "bleu_stderr": 0.040383389212127606, "rouge1_fmeasure": 0.12037862744280613, "rouge1_fmeasure_stderr": 0.0019934979793417573, "rouge1_precision": 0.07912418599977926, "rouge1_precision_stderr": 0.0016198563555509216, "rouge1_recall": 0.3469232249585729, "rouge1_recall_stderr": 0.004748846708374396, "rouge2_fmeasure": 0.057885613023243146, "rouge2_fmeasure_stderr": 0.0012856906291534122, "rouge2_precision": 0.03786394920640546, "rouge2_precision_stderr": 0.0009914452911534558, "rouge2_recall": 0.1738267805541333, "rouge2_recall_stderr": 0.0034485131641693997, "rougeL_fmeasure": 0.11423756257112685, "rougeL_fmeasure_stderr": 0.001835936651182981, "rougeL_precision": 0.07493423869370922, "rougeL_precision_stderr": 0.001498740213643017, "rougeL_recall": 0.3315897347595549, "rougeL_recall_stderr": 0.004565989220241603, "rougeLsum_fmeasure": 0.11380829873927818, "rougeLsum_fmeasure_stderr": 0.001868863082530539, "rougeLsum_precision": 0.07489164236007877, "rougeLsum_precision_stderr": 0.0015400968477694266, "rougeLsum_recall": 0.32772172422348506, "rougeLsum_recall_stderr": 0.0044015503137291995}}, "5": {"PALM_prompt": {"bleu": 0.4100101691229707, "bleu_stderr": 0.038169326985519704, "rouge1_fmeasure": 0.11616969508673808, "rouge1_fmeasure_stderr": 0.001954781890870284, "rouge1_precision": 0.0761749476924903, "rouge1_precision_stderr": 0.0015866409368917428, "rouge1_recall": 0.33694404520159127, "rouge1_recall_stderr": 0.004725880718920406, "rouge2_fmeasure": 0.055555746394993916, "rouge2_fmeasure_stderr": 0.0012524254498175414, "rouge2_precision": 0.03627352196162682, "rouge2_precision_stderr": 0.000977188491579374, "rouge2_recall": 0.167813784642509, "rouge2_recall_stderr": 0.0033985471522842725, "rougeL_fmeasure": 0.11092231794095461, "rougeL_fmeasure_stderr": 0.0018208627331043194, "rougeL_precision": 0.07259356194973668, "rougeL_precision_stderr": 0.0014842070922326297, "rougeL_recall": 0.3238850222294547, "rougeL_recall_stderr": 0.00455273082293743, "rougeLsum_fmeasure": 0.11030205012975042, "rougeLsum_fmeasure_stderr": 0.0018300168280422182, "rougeLsum_precision": 0.07239284886715931, "rougeLsum_precision_stderr": 0.0015076194366894973, "rougeLsum_recall": 0.3201817418126636, "rougeLsum_recall_stderr": 0.004401296672435449}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.06823499187294975, "bleu_stderr": 0.02672839530839174, "rouge1_fmeasure": 0.09998460503837092, "rouge1_fmeasure_stderr": 0.0010700519248261517, "rouge1_precision": 0.09250196886098348, "rouge1_precision_stderr": 0.0012387826019604092, "rouge1_recall": 0.12905376314562253, "rouge1_recall_stderr": 0.0013091825113428792, "rouge2_fmeasure": 0.004548025347793951, "rouge2_fmeasure_stderr": 0.00021301188784611375, "rouge2_precision": 0.004542792640061134, "rouge2_precision_stderr": 0.00021902406054084622, "rouge2_recall": 0.005267988260289562, "rouge2_recall_stderr": 0.0002641606109254951, "rougeL_fmeasure": 0.0884294786747584, "rougeL_fmeasure_stderr": 0.0008862698881090451, "rougeL_precision": 0.0806751359892451, "rougeL_precision_stderr": 0.000988255429146176, "rougeL_recall": 0.11638678461084004, "rougeL_recall_stderr": 0.0011920958367752498, "rougeLsum_fmeasure": 0.09653806771590727, "rougeLsum_fmeasure_stderr": 0.0010284292205864202, "rougeLsum_precision": 0.08931760733909207, "rougeLsum_precision_stderr": 0.0011977856331501172, "rougeLsum_recall": 0.12471118748837833, "rougeLsum_recall_stderr": 
0.0012621097866983477}}, "1": {"tldr_en": {"bleu": 0.627587943269457, "bleu_stderr": 0.03752664558072816, "rouge1_fmeasure": 0.1176727050001766, "rouge1_fmeasure_stderr": 0.0015061256733290465, "rouge1_precision": 0.10227535255850072, "rouge1_precision_stderr": 0.0015211441148919019, "rouge1_recall": 0.1666238343075504, "rouge1_recall_stderr": 0.002113191459192441, "rouge2_fmeasure": 0.011758190082089873, "rouge2_fmeasure_stderr": 0.0005161253570713452, "rouge2_precision": 0.010133390687879478, "rouge2_precision_stderr": 0.00045221775358988313, "rouge2_recall": 0.017278009943298343, "rouge2_recall_stderr": 0.0008307221313804175, "rougeL_fmeasure": 0.0977465022371193, "rougeL_fmeasure_stderr": 0.0010778296010975216, "rougeL_precision": 0.0840035133484757, "rougeL_precision_stderr": 0.00108582843746046, "rougeL_recall": 0.1409822252220471, "rougeL_recall_stderr": 0.0016244906370306395, "rougeLsum_fmeasure": 0.11014610167130383, "rougeLsum_fmeasure_stderr": 0.0014009647264510289, "rougeLsum_precision": 0.09563245107317732, "rougeLsum_precision_stderr": 0.0014144569423562025, "rougeLsum_recall": 0.15636027194288163, "rougeLsum_recall_stderr": 0.001975628206486446}}, "2": {"tldr_en": {"bleu": 1.3448790104080863, "bleu_stderr": 0.06809231563626762, "rouge1_fmeasure": 0.15805650713785413, "rouge1_fmeasure_stderr": 0.0019004938271445784, "rouge1_precision": 0.13533522102182108, "rouge1_precision_stderr": 0.0018951373520473378, "rouge1_recall": 0.22856932435608124, "rouge1_recall_stderr": 0.0027080432167508025, "rouge2_fmeasure": 0.02760738077834664, "rouge2_fmeasure_stderr": 0.0008049340257416579, "rouge2_precision": 0.023543013388107385, "rouge2_precision_stderr": 0.000717310361867752, "rouge2_recall": 0.040832806012490945, "rouge2_recall_stderr": 0.001298377746871254, "rougeL_fmeasure": 0.12169470238796481, "rougeL_fmeasure_stderr": 0.0012563483854001994, "rougeL_precision": 0.10293794994726356, "rougeL_precision_stderr": 0.0012437549972814086, "rougeL_recall": 0.1802214254693049, "rougeL_recall_stderr": 0.0020107380760737443, "rougeLsum_fmeasure": 0.14735538469349385, "rougeLsum_fmeasure_stderr": 0.0017649380102418163, "rougeLsum_precision": 0.12611401083224677, "rougeLsum_precision_stderr": 0.0017609062395669717, "rougeLsum_recall": 0.21348564535890185, "rougeLsum_recall_stderr": 0.0025299144255695695}}, "3": {"tldr_en": {"bleu": 1.528039320029562, "bleu_stderr": 0.036905573594762583, "rouge1_fmeasure": 0.14320563335091396, "rouge1_fmeasure_stderr": 0.0020698343768995294, "rouge1_precision": 0.12776173884849967, "rouge1_precision_stderr": 0.002166703255999288, "rouge1_recall": 0.2075725639546676, "rouge1_recall_stderr": 0.0030329813736934214, "rouge2_fmeasure": 0.026283043574202986, "rouge2_fmeasure_stderr": 0.0008159202511212215, "rouge2_precision": 0.022636857885145915, "rouge2_precision_stderr": 0.0007440116219601286, "rouge2_recall": 0.0397256121665061, "rouge2_recall_stderr": 0.001322773196274425, "rougeL_fmeasure": 0.10883270044311108, "rougeL_fmeasure_stderr": 0.0014339155970440763, "rougeL_precision": 0.09657179764839124, "rougeL_precision_stderr": 0.0015732121770481863, "rougeL_recall": 0.16159908811630536, "rougeL_recall_stderr": 0.0023213429955332913, "rougeLsum_fmeasure": 0.13283753381327199, "rougeLsum_fmeasure_stderr": 0.0019146305178959213, "rougeLsum_precision": 0.11866324620879036, "rougeLsum_precision_stderr": 0.0020254088359404664, "rougeLsum_recall": 0.19292621911672542, "rougeLsum_recall_stderr": 0.002834333227053115}}, "4": {"tldr_en": {"bleu": 0.37223171728306426, 
"bleu_stderr": 0.03014516594133762, "rouge1_fmeasure": 0.04899842107573778, "rouge1_fmeasure_stderr": 0.0017104674234628445, "rouge1_precision": 0.04598706351137218, "rouge1_precision_stderr": 0.0018221188645625001, "rouge1_recall": 0.0739807986804339, "rouge1_recall_stderr": 0.002646171903509083, "rouge2_fmeasure": 0.009690319746932921, "rouge2_fmeasure_stderr": 0.0005414022669991509, "rouge2_precision": 0.008841756168682364, "rouge2_precision_stderr": 0.0006116495968030462, "rouge2_recall": 0.015468257674774097, "rouge2_recall_stderr": 0.0009624564354622926, "rougeL_fmeasure": 0.03836996592095654, "rougeL_fmeasure_stderr": 0.0012956784650356502, "rougeL_precision": 0.03612427764892416, "rougeL_precision_stderr": 0.0014536883711277432, "rougeL_recall": 0.05940229969748619, "rougeL_recall_stderr": 0.002132196101860115, "rougeLsum_fmeasure": 0.04519059371598313, "rougeLsum_fmeasure_stderr": 0.001579759147906247, "rougeLsum_precision": 0.04261326941140918, "rougeLsum_precision_stderr": 0.0017129168307166089, "rougeLsum_recall": 0.06810975113263097, "rougeLsum_recall_stderr": 0.0024407540230673218}}, "5": {"tldr_en": {"bleu": 5.013143696035085e-07, "bleu_stderr": 9.50150744136576e-07, "rouge1_fmeasure": 0.007820849054110231, "rouge1_fmeasure_stderr": 0.0007600843285544493, "rouge1_precision": 0.007551294746242374, "rouge1_precision_stderr": 0.0008016066090893644, "rouge1_recall": 0.01175212455089267, "rouge1_recall_stderr": 0.0011644529839797179, "rouge2_fmeasure": 0.001550055750323692, "rouge2_fmeasure_stderr": 0.0002154946138593672, "rouge2_precision": 0.0014154766947717414, "rouge2_precision_stderr": 0.00021722050245300052, "rouge2_recall": 0.002696191748609163, "rouge2_recall_stderr": 0.0004898145017501464, "rougeL_fmeasure": 0.006164411990577186, "rougeL_fmeasure_stderr": 0.0005772505558321714, "rougeL_precision": 0.005847845224705761, "rougeL_precision_stderr": 0.000595323185000189, "rougeL_recall": 0.009647797063887112, "rougeL_recall_stderr": 0.0009660140204759589, "rougeLsum_fmeasure": 0.007179693639002067, "rougeLsum_fmeasure_stderr": 0.0006977167152892271, "rougeLsum_precision": 0.006936670272296256, "rougeLsum_precision_stderr": 0.0007360722546012934, "rougeLsum_recall": 0.010824451468983412, "rougeLsum_recall_stderr": 0.0010820316072561469}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.09084719792363964, "bleu_stderr": 0.023524182784253476, "rouge1_fmeasure": 0.016457051237123384, "rouge1_fmeasure_stderr": 0.0003925090249226089, "rouge1_precision": 0.013115055244320387, "rouge1_precision_stderr": 0.0003168780725030529, "rouge1_recall": 0.023423577684344953, "rouge1_recall_stderr": 0.0005821108276955901, "rouge2_fmeasure": 0.0003989196235540006, "rouge2_fmeasure_stderr": 0.00010791151592722659, "rouge2_precision": 0.0003338211382113822, "rouge2_precision_stderr": 8.920509387302503e-05, "rouge2_recall": 0.0005083527300182993, "rouge2_recall_stderr": 0.00013811375079865392, "rougeL_fmeasure": 0.016457051237123384, "rougeL_fmeasure_stderr": 0.0003925090249226089, "rougeL_precision": 0.013115055244320387, "rougeL_precision_stderr": 0.0003168780725030529, "rougeL_recall": 0.023423577684344953, "rougeL_recall_stderr": 0.0005821108276955901, "rougeLsum_fmeasure": 0.015885414699151633, "rougeLsum_fmeasure_stderr": 0.00036971385737970014, "rougeLsum_precision": 0.012659903516962778, "rougeLsum_precision_stderr": 0.00029850433041728446, "rougeLsum_recall": 0.0226118934358276, "rougeLsum_recall_stderr": 0.0005511022657739085}}, "1": {"generate_text_restaurant": {"bleu": 
8.493656512933155, "bleu_stderr": 0.06507341585485718, "rouge1_fmeasure": 0.38211667792792864, "rouge1_fmeasure_stderr": 0.002042530024715172, "rouge1_precision": 0.3973818846475776, "rouge1_precision_stderr": 0.0026976072721873632, "rouge1_recall": 0.4223117985183209, "rouge1_recall_stderr": 0.002783358808801643, "rouge2_fmeasure": 0.16014559457551358, "rouge2_fmeasure_stderr": 0.0014918368862659674, "rouge2_precision": 0.16992657792281268, "rouge2_precision_stderr": 0.0019145879314252548, "rouge2_recall": 0.178124519625553, "rouge2_recall_stderr": 0.0018587557359939306, "rougeL_fmeasure": 0.2656366947575347, "rougeL_fmeasure_stderr": 0.00154921779515304, "rougeL_precision": 0.2784440754511285, "rougeL_precision_stderr": 0.002173881277475784, "rougeL_recall": 0.2960008580013917, "rougeL_recall_stderr": 0.00223721376663644, "rougeLsum_fmeasure": 0.3191928781032716, "rougeLsum_fmeasure_stderr": 0.0018752599773732524, "rougeLsum_precision": 0.33386523672528523, "rougeLsum_precision_stderr": 0.002495920334450439, "rougeLsum_recall": 0.3524456027280379, "rougeLsum_recall_stderr": 0.00250316491749807}}, "2": {"generate_text_restaurant": {"bleu": 9.70861649242996, "bleu_stderr": 0.180728212256743, "rouge1_fmeasure": 0.4028985936075782, "rouge1_fmeasure_stderr": 0.0019913008996638617, "rouge1_precision": 0.4246829342811345, "rouge1_precision_stderr": 0.002766857681080391, "rouge1_recall": 0.42902360153249547, "rouge1_recall_stderr": 0.0026637334370122148, "rouge2_fmeasure": 0.17685301250165733, "rouge2_fmeasure_stderr": 0.0015810873135005685, "rouge2_precision": 0.188091439420101, "rouge2_precision_stderr": 0.0019527339746865022, "rouge2_recall": 0.18884241914729302, "rouge2_recall_stderr": 0.001866659004782858, "rougeL_fmeasure": 0.28212495177649627, "rougeL_fmeasure_stderr": 0.0016022344134082288, "rougeL_precision": 0.2982294118028583, "rougeL_precision_stderr": 0.0022177478762698505, "rougeL_recall": 0.3014772250776991, "rougeL_recall_stderr": 0.0021343172062020443, "rougeLsum_fmeasure": 0.33457230441540664, "rougeLsum_fmeasure_stderr": 0.0018977411692316212, "rougeLsum_precision": 0.3532941586993858, "rougeLsum_precision_stderr": 0.002547817337684264, "rougeLsum_recall": 0.3559974321347138, "rougeLsum_recall_stderr": 0.0024336261264831867}}, "3": {"generate_text_restaurant": {"bleu": 9.966588735110538, "bleu_stderr": 0.12599090386040324, "rouge1_fmeasure": 0.4068505832141505, "rouge1_fmeasure_stderr": 0.0019329494846097367, "rouge1_precision": 0.42159026565077223, "rouge1_precision_stderr": 0.002651117861201503, "rouge1_recall": 0.436277297035412, "rouge1_recall_stderr": 0.0026416756972084655, "rouge2_fmeasure": 0.17997150688916153, "rouge2_fmeasure_stderr": 0.001592406273022251, "rouge2_precision": 0.18776315928969667, "rouge2_precision_stderr": 0.0018907826120837835, "rouge2_recall": 0.19380136137444595, "rouge2_recall_stderr": 0.0019180421049331063, "rougeL_fmeasure": 0.28424346982191884, "rougeL_fmeasure_stderr": 0.0016109414932517093, "rougeL_precision": 0.2954653149171501, "rougeL_precision_stderr": 0.002167984706061261, "rougeL_recall": 0.30557723457112207, "rougeL_recall_stderr": 0.0021619718205346107, "rougeLsum_fmeasure": 0.3401366732280647, "rougeLsum_fmeasure_stderr": 0.0018580545934390128, "rougeLsum_precision": 0.3525432765457679, "rougeLsum_precision_stderr": 0.0024285145519395723, "rougeLsum_recall": 0.3651612369599634, "rougeLsum_recall_stderr": 0.0024667743399167472}}, "4": {"generate_text_restaurant": {"bleu": 9.251162519243813, "bleu_stderr": 0.16043314004667306, 
"rouge1_fmeasure": 0.3969692723001995, "rouge1_fmeasure_stderr": 0.0019731818045355485, "rouge1_precision": 0.40242023070868665, "rouge1_precision_stderr": 0.0027480602148687965, "rouge1_recall": 0.436755013677424, "rouge1_recall_stderr": 0.002566467851228464, "rouge2_fmeasure": 0.17289054467515227, "rouge2_fmeasure_stderr": 0.00158925330122968, "rouge2_precision": 0.176937710453779, "rouge2_precision_stderr": 0.0019212015420408378, "rouge2_recall": 0.19058682088957968, "rouge2_recall_stderr": 0.0018838368515368652, "rougeL_fmeasure": 0.2734957726548441, "rougeL_fmeasure_stderr": 0.0016372934907566048, "rougeL_precision": 0.2782288135889908, "rougeL_precision_stderr": 0.002208420858744797, "rougeL_recall": 0.30117513507531163, "rougeL_recall_stderr": 0.002108417735667974, "rougeLsum_fmeasure": 0.33022165481719784, "rougeLsum_fmeasure_stderr": 0.001911613908677662, "rougeLsum_precision": 0.3343952904639146, "rougeLsum_precision_stderr": 0.0024871514512787504, "rougeLsum_recall": 0.36379303588555467, "rougeLsum_recall_stderr": 0.0024348682831782293}}, "5": {"generate_text_restaurant": {"bleu": 8.324834183472209, "bleu_stderr": 0.12201022191511865, "rouge1_fmeasure": 0.3826658470581734, "rouge1_fmeasure_stderr": 0.0019397433080247473, "rouge1_precision": 0.3770667876954104, "rouge1_precision_stderr": 0.002761667549229222, "rouge1_recall": 0.4384978042745568, "rouge1_recall_stderr": 0.002504170522982392, "rouge2_fmeasure": 0.1633047480034024, "rouge2_fmeasure_stderr": 0.0015591168107894247, "rouge2_precision": 0.16298069992158787, "rouge2_precision_stderr": 0.001917959278145272, "rouge2_recall": 0.18727836348007232, "rouge2_recall_stderr": 0.0018506477395530997, "rougeL_fmeasure": 0.2662588358520316, "rougeL_fmeasure_stderr": 0.0016519997409068458, "rougeL_precision": 0.263196349500877, "rougeL_precision_stderr": 0.002234840925291514, "rougeL_recall": 0.30529924132930725, "rougeL_recall_stderr": 0.0021085926401520298, "rougeLsum_fmeasure": 0.3210382724978914, "rougeLsum_fmeasure_stderr": 0.0018713242691196597, "rougeLsum_precision": 0.3161706099613126, "rougeLsum_precision_stderr": 0.002502426291497884, "rougeLsum_recall": 0.36828376734500556, "rougeLsum_recall_stderr": 0.002374793122680808}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.050773439614255005, "bleu_stderr": 0.009607935815050534, "rouge1_fmeasure": 0.10525034871077296, "rouge1_fmeasure_stderr": 0.0014205929613240243, "rouge1_precision": 0.07457722326961545, "rouge1_precision_stderr": 0.0010813149665918967, "rouge1_recall": 0.18616767062678954, "rouge1_recall_stderr": 0.0023262601944921035, "rouge2_fmeasure": 0.005853296742054689, "rouge2_fmeasure_stderr": 0.0003595212542050643, "rouge2_precision": 0.004128841009305119, "rouge2_precision_stderr": 0.00025477847907952686, "rouge2_recall": 0.01051230796440712, "rouge2_recall_stderr": 0.0006602646260890872, "rougeL_fmeasure": 0.0912182400057594, "rougeL_fmeasure_stderr": 0.001142160193884147, "rougeL_precision": 0.06451608999455927, "rougeL_precision_stderr": 0.0008702064411682341, "rougeL_recall": 0.1622598214643354, "rougeL_recall_stderr": 0.0019407556202309798, "rougeLsum_fmeasure": 0.08935271982090909, "rougeLsum_fmeasure_stderr": 0.001186659250809151, "rougeLsum_precision": 0.06318245796876595, "rougeLsum_precision_stderr": 0.0008975507843696847, "rougeLsum_recall": 0.15913828904970537, "rougeLsum_recall_stderr": 0.002044934654600564}}, "1": {"article_DOC_summary": {"bleu": 0.37658580921002954, "bleu_stderr": 0.03383903342773015, "rouge1_fmeasure": 0.11065247004530705, 
"rouge1_fmeasure_stderr": 0.0017812843994385369, "rouge1_precision": 0.07879837485772763, "rouge1_precision_stderr": 0.0013599444284337506, "rouge1_recall": 0.19465714656847785, "rouge1_recall_stderr": 0.0029003189713371035, "rouge2_fmeasure": 0.009780488414863557, "rouge2_fmeasure_stderr": 0.0006426504271738838, "rouge2_precision": 0.006994168819790609, "rouge2_precision_stderr": 0.0004648272193447288, "rouge2_recall": 0.017050685879784068, "rouge2_recall_stderr": 0.0011197895634276233, "rougeL_fmeasure": 0.0932686816822836, "rougeL_fmeasure_stderr": 0.001351090628116566, "rougeL_precision": 0.06626180423614893, "rougeL_precision_stderr": 0.00103152044172589, "rougeL_recall": 0.16506949443493862, "rougeL_recall_stderr": 0.002243999574785469, "rougeLsum_fmeasure": 0.09068800328595211, "rougeLsum_fmeasure_stderr": 0.0014406209430729104, "rougeLsum_precision": 0.06443833252424745, "rougeLsum_precision_stderr": 0.0010900785870787669, "rougeLsum_recall": 0.1605294884501161, "rougeLsum_recall_stderr": 0.0024099921996914896}}, "2": {"article_DOC_summary": {"bleu": 0.5613289650024746, "bleu_stderr": 0.06954224370193056, "rouge1_fmeasure": 0.12480432119777549, "rouge1_fmeasure_stderr": 0.0021054789997765754, "rouge1_precision": 0.0889047742823774, "rouge1_precision_stderr": 0.0015736385663893536, "rouge1_recall": 0.21871321971864371, "rouge1_recall_stderr": 0.003489898707290529, "rouge2_fmeasure": 0.013892927856252131, "rouge2_fmeasure_stderr": 0.0009041385431726432, "rouge2_precision": 0.009892665457522843, "rouge2_precision_stderr": 0.0006521664121306877, "rouge2_recall": 0.02437366955189581, "rouge2_recall_stderr": 0.0015659961501778526, "rougeL_fmeasure": 0.1015360359782745, "rougeL_fmeasure_stderr": 0.0015523847061620607, "rougeL_precision": 0.07214973246105617, "rougeL_precision_stderr": 0.00116007366829089, "rougeL_recall": 0.1791872648503229, "rougeL_recall_stderr": 0.002631093758225792, "rougeLsum_fmeasure": 0.10235719035547435, "rougeLsum_fmeasure_stderr": 0.0017287630454853157, "rougeLsum_precision": 0.07278538171522661, "rougeLsum_precision_stderr": 0.001284146511191204, "rougeLsum_recall": 0.18033245428711908, "rougeLsum_recall_stderr": 0.0029284834426938845}}, "3": {"article_DOC_summary": {"bleu": 0.7079488064244481, "bleu_stderr": 0.07074366573560065, "rouge1_fmeasure": 0.12920118195381422, "rouge1_fmeasure_stderr": 0.002284952127981043, "rouge1_precision": 0.09397556603911704, "rouge1_precision_stderr": 0.0017994169570713602, "rouge1_recall": 0.22313205728599414, "rouge1_recall_stderr": 0.0039035927988892247, "rouge2_fmeasure": 0.017334182525631046, "rouge2_fmeasure_stderr": 0.0010127873838950666, "rouge2_precision": 0.01255758062915019, "rouge2_precision_stderr": 0.0007553728961037906, "rouge2_recall": 0.030710887496420355, "rouge2_recall_stderr": 0.0018415895742210585, "rougeL_fmeasure": 0.1039819933524293, "rougeL_fmeasure_stderr": 0.0017400868646152738, "rougeL_precision": 0.07567512644757472, "rougeL_precision_stderr": 0.0013958572399353046, "rougeL_recall": 0.18027261644595613, "rougeL_recall_stderr": 0.003034050646720842, "rougeLsum_fmeasure": 0.10658308430361232, "rougeLsum_fmeasure_stderr": 0.0018787907354694993, "rougeLsum_precision": 0.0775560956207995, "rougeLsum_precision_stderr": 0.0015015791755260772, "rougeLsum_recall": 0.1849635012704667, "rougeLsum_recall_stderr": 0.003277420997428528}}, "4": {"article_DOC_summary": {"bleu": 0.4082429912701259, "bleu_stderr": 0.0779765946109533, "rouge1_fmeasure": 0.03895580700792708, "rouge1_fmeasure_stderr": 
0.002269413843476501, "rouge1_precision": 0.03264760362151805, "rouge1_precision_stderr": 0.0020833568971290787, "rouge1_recall": 0.061777608876693486, "rouge1_recall_stderr": 0.0037289394788052755, "rouge2_fmeasure": 0.0051666306699138615, "rouge2_fmeasure_stderr": 0.0005905785787950259, "rouge2_precision": 0.0038550154412843093, "rouge2_precision_stderr": 0.0004446008638460962, "rouge2_recall": 0.009089864083505584, "rouge2_recall_stderr": 0.0010911425483655028, "rougeL_fmeasure": 0.030269389323315635, "rougeL_fmeasure_stderr": 0.0017191936988352017, "rougeL_precision": 0.025565238785773294, "rougeL_precision_stderr": 0.0016418229878507925, "rougeL_recall": 0.04842391895975862, "rougeL_recall_stderr": 0.002904785956254577, "rougeLsum_fmeasure": 0.03182638094199837, "rougeLsum_fmeasure_stderr": 0.001849989660796298, "rougeLsum_precision": 0.026709322906345055, "rougeLsum_precision_stderr": 0.0017170791129193154, "rougeLsum_recall": 0.050929122969953576, "rougeLsum_recall_stderr": 0.0031086814353982915}}, "5": {"article_DOC_summary": {"bleu": 2.3068185785066287e-38, "bleu_stderr": 6.496274000069576e-33, "rouge1_fmeasure": 0.0021817427772383405, "rouge1_fmeasure_stderr": 0.0006550320885850304, "rouge1_precision": 0.002487457924399334, "rouge1_precision_stderr": 0.0007733944656593977, "rouge1_recall": 0.0019894770885392197, "rouge1_recall_stderr": 0.000580222680643798, "rouge2_fmeasure": 0.00026626492060156015, "rouge2_fmeasure_stderr": 0.00019473030694105065, "rouge2_precision": 0.0003335239184295789, "rouge2_precision_stderr": 0.0002474907080024258, "rouge2_recall": 0.00022186932564291056, "rouge2_recall_stderr": 0.00016062217222749233, "rougeL_fmeasure": 0.00163204036870977, "rougeL_fmeasure_stderr": 0.00048037762637480547, "rougeL_precision": 0.0018641694287175915, "rougeL_precision_stderr": 0.0005696317403990395, "rougeL_recall": 0.001490936334422112, "rougeL_recall_stderr": 0.0004262092207690586, "rougeLsum_fmeasure": 0.0016483540386329559, "rougeLsum_fmeasure_stderr": 0.0004762553014171529, "rougeLsum_precision": 0.0018726329116100717, "rougeLsum_precision_stderr": 0.0005610053066563542, "rougeLsum_recall": 0.0015127742100335723, "rougeLsum_recall_stderr": 0.00042560130622958597}}}} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..93450c681e0c63f4fec0b34e0f907139ffefabac --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.38932054005592026, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.033555755060432874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07700762746532616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017574979074395903 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32659511083930237, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004731650619590043 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11567054917865605, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020244917898064024 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03638446401446654, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010787709184703832 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16083759865432445, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003301779480848987 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05486406523084765, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001296868906653324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07349742960861025, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015742317195240577 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3166189013286804, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004624777728623184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1109504406851204, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018678472410080015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07279482633328832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016761377220670234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3093845485351636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044227058124167145 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10923439857735848, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018915212851263181 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e5abbec272c6a090e89c89674f3e07b35557f8f9 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4010897789440996, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.035896717451801716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + 
"rouge1_precision": 0.07680237479305678, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015098564819853281 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33534169873311565, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004781931847971189 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11699641676828608, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019738238820560176 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03628266659322727, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000933027012126572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16628641951511963, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033828151468031586 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.055756584513908414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001272076817653578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07343722417982688, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013890074049552911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3242524897206685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004652592170560419 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11218413128284264, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018370252279043208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07304381756570862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014318516789194186 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31848926688825263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004464167390774732 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11118263806943471, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00186587403392927 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.json new file mode 
100644 index 0000000000000000000000000000000000000000..fc42ba053ca337de53458933270bb5b7be4cb608 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.42050495009420186, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.032401125485335705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07641858629031524, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001512634713162659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33877843978124605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004754821881470426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11702064522926466, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019445019569898875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.036264403146498855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009035954944938369 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1690812614534248, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003434599850442417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05594462621018749, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012491415665991895 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07283896117332733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013782370339395086 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3264281887874627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004614214472784931 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11189801577003984, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018101065557781222 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07258450279456244, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001408393730596637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3215563384939613, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004418413263595925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11116640622510127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018282586995381895 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..497aa7cf6fa91bd94573249faa9947098a809e62 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4159282397373365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.028655242365073413 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07708995134838893, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016510440818257972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3354474085870324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00468185355577616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11669978392247876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001947629137421236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.036292243594643316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009640743250760637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1671806366343255, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033800199116181236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05573726170555429, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001260032003769061 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07334583374730402, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015569789464435146 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3219883012710275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004531388461994007 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11124128735569962, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018188725421961862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0731288278640945, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001584414887403581 + }, + { + "task_name": 
"GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3175376346502283, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004360960463880489 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1105408760944851, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018355827239676303 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9c7b055e20a55014de0810fd7b7be0df7a111d3f --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.45977953222907975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.040383389212127606 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07912418599977926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016198563555509216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3469232249585729, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004748846708374396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12037862744280613, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019934979793417573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03786394920640546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009914452911534558 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1738267805541333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034485131641693997 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.057885613023243146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012856906291534122 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07493423869370922, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001498740213643017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3315897347595549, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 
0.004565989220241603 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11423756257112685, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001835936651182981 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07489164236007877, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015400968477694266 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.32772172422348506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0044015503137291995 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11380829873927818, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001868863082530539 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..858e0bfbff3bffcfc7cec6d0bc148cab52979ff3 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4100101691229707, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.038169326985519704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0761749476924903, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015866409368917428 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33694404520159127, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004725880718920406 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11616969508673808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001954781890870284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03627352196162682, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000977188491579374 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.167813784642509, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033985471522842725 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.055555746394993916, + "dataset_path": "GEM/web_nlg", + "dataset_name": 
"en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012524254498175414 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07259356194973668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014842070922326297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3238850222294547, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.00455273082293743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11092231794095461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018208627331043194 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07239284886715931, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015076194366894973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3201817418126636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004401296672435449 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11030205012975042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018300168280422182 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17e81424c58ddf74c5380c5981beb35b21138e07 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.09250196886098348, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012387826019604092 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.12905376314562253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013091825113428792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.09998460503837092, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010700519248261517 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.004542792640061134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00021902406054084622 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": 
"tldr_en", + "rouge2_recall": 0.005267988260289562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002641606109254951 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004548025347793951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00021301188784611375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.0806751359892451, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000988255429146176 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.11638678461084004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011920958367752498 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0884294786747584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008862698881090451 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.08931760733909207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011977856331501172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.12471118748837833, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012621097866983477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.09653806771590727, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010284292205864202 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.06823499187294975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02672839530839174 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..c1801d919d1193c023c8ef5e3776dc6c47c6c1cd --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.10227535255850072, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015211441148919019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.1666238343075504, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + 
"rouge1_recall_stderr": 0.002113191459192441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1176727050001766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015061256733290465 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.010133390687879478, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00045221775358988313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.017278009943298343, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008307221313804175 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.011758190082089873, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005161253570713452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.0840035133484757, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00108582843746046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1409822252220471, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016244906370306395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0977465022371193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010778296010975216 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.09563245107317732, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014144569423562025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.15636027194288163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001975628206486446 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.11014610167130383, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0014009647264510289 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.627587943269457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03752664558072816 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..aa335df843e4a20abd47383c0b44ab5713b85bac --- 
/dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13533522102182108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018951373520473378 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22856932435608124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027080432167508025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.15805650713785413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019004938271445784 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.023543013388107385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000717310361867752 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.040832806012490945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001298377746871254 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.02760738077834664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008049340257416579 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10293794994726356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012437549972814086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1802214254693049, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020107380760737443 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12169470238796481, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012563483854001994 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12611401083224677, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017609062395669717 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21348564535890185, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025299144255695695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.14735538469349385, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017649380102418163 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.3448790104080863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06809231563626762 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d92a4c602d670439db609f445eb90b91d94f5954 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.12776173884849967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002166703255999288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2075725639546676, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0030329813736934214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.14320563335091396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020698343768995294 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.022636857885145915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007440116219601286 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0397256121665061, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001322773196274425 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.026283043574202986, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008159202511212215 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.09657179764839124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015732121770481863 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.16159908811630536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023213429955332913 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.10883270044311108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014339155970440763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.11866324620879036, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020254088359404664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.19292621911672542, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rougeLsum_recall_stderr": 0.002834333227053115 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.13283753381327199, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019146305178959213 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.528039320029562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.036905573594762583 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ca01d16368bf2474b3bb74afe469ae16c3716f39 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.04598706351137218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018221188645625001 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.0739807986804339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002646171903509083 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.04899842107573778, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017104674234628445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.008841756168682364, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006116495968030462 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.015468257674774097, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0009624564354622926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.009690319746932921, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0005414022669991509 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.03612427764892416, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014536883711277432 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05940229969748619, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002132196101860115 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 
0.03836996592095654, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012956784650356502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.04261326941140918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017129168307166089 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06810975113263097, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024407540230673218 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.04519059371598313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001579759147906247 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.37223171728306426, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03014516594133762 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c16b8a2922c7191bbee309053e2e3e38c28dba3c --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.007551294746242374, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008016066090893644 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.01175212455089267, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011644529839797179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.007820849054110231, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007600843285544493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0014154766947717414, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00021722050245300052 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.002696191748609163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004898145017501464 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.001550055750323692, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 
0.0002154946138593672 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.005847845224705761, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000595323185000189 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.009647797063887112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0009660140204759589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006164411990577186, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0005772505558321714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.006936670272296256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007360722546012934 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010824451468983412, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010820316072561469 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.007179693639002067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0006977167152892271 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.013143696035085e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.50150744136576e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..deefde80f9a7175adaf7014199526255132c551d --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.09084719792363964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.023524182784253476 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.013115055244320387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0003168780725030529 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.023423577684344953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0005821108276955901 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.016457051237123384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0003925090249226089 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.0003338211382113822, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 8.920509387302503e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.0005083527300182993, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.00013811375079865392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0003989196235540006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00010791151592722659 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.013115055244320387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0003168780725030529 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.023423577684344953, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0005821108276955901 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.016457051237123384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0003925090249226089 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.012659903516962778, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00029850433041728446 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.0226118934358276, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0005511022657739085 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.015885414699151633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00036971385737970014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4a6e59ecfba56334cff0ff8df314859c35ee2c7d --- /dev/null +++ 
b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.493656512933155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06507341585485718 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3973818846475776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0026976072721873632 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4223117985183209, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002783358808801643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.38211667792792864, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002042530024715172 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.16992657792281268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019145879314252548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.178124519625553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018587557359939306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.16014559457551358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014918368862659674 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2784440754511285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002173881277475784 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2960008580013917, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.00223721376663644 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2656366947575347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00154921779515304 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.33386523672528523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002495920334450439 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3524456027280379, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00250316491749807 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3191928781032716, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018752599773732524 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6902a57313083a78e6ca90a1bfb4103b3c4d2d59 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.70861649242996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.180728212256743 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4246829342811345, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002766857681080391 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.42902360153249547, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026637334370122148 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4028985936075782, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019913008996638617 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.188091439420101, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019527339746865022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18884241914729302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001866659004782858 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17685301250165733, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015810873135005685 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2982294118028583, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0022177478762698505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3014772250776991, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021343172062020443 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28212495177649627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016022344134082288 + }, + { + "task_name": "e2e_nlg_cleaned", + 
"prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3532941586993858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002547817337684264 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3559974321347138, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024336261264831867 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.33457230441540664, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018977411692316212 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ee60d711ee411d6f85c4356c617da5957627eba6 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.966588735110538, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12599090386040324 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.42159026565077223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002651117861201503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.436277297035412, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026416756972084655 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4068505832141505, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019329494846097367 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.18776315928969667, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018907826120837835 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19380136137444595, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019180421049331063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17997150688916153, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001592406273022251 + }, + { + 
"task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2954653149171501, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002167984706061261 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.30557723457112207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021619718205346107 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28424346982191884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016109414932517093 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3525432765457679, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0024285145519395723 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3651612369599634, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024667743399167472 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3401366732280647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018580545934390128 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ac8007c412e33bd9778818408d0df9e20f169386 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.251162519243813, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16043314004667306 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.40242023070868665, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0027480602148687965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.436755013677424, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002566467851228464 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3969692723001995, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 
0.0019731818045355485 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.176937710453779, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019212015420408378 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19058682088957968, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018838368515368652 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17289054467515227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00158925330122968 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2782288135889908, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002208420858744797 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.30117513507531163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002108417735667974 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2734957726548441, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016372934907566048 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3343952904639146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0024871514512787504 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36379303588555467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024348682831782293 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.33022165481719784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001911613908677662 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..94aa61f2a110c1e259164ea27c03476630207b8e --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.324834183472209, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + 
"subset": null, + "bleu_stderr": 0.12201022191511865 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3770667876954104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002761667549229222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4384978042745568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002504170522982392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3826658470581734, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019397433080247473 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.16298069992158787, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001917959278145272 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18727836348007232, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018506477395530997 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1633047480034024, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015591168107894247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.263196349500877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002234840925291514 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.30529924132930725, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0021085926401520298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2662588358520316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016519997409068458 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3161706099613126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002502426291497884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36828376734500556, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002374793122680808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3210382724978914, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018713242691196597 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", 
+ "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..486d4e45595d34c2b255af96ea6b6038fc8cf3b9 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07457722326961545, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0010813149665918967 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.18616767062678954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0023262601944921035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.10525034871077296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0014205929613240243 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.004128841009305119, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00025477847907952686 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01051230796440712, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0006602646260890872 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.005853296742054689, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0003595212542050643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06451608999455927, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0008702064411682341 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1622598214643354, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0019407556202309798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0912182400057594, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001142160193884147 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.06318245796876595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008975507843696847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.15913828904970537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002044934654600564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.08935271982090909, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001186659250809151 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.050773439614255005, + 
"dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.009607935815050534 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9255829323b2010fcacb74344ea712f086f0d4af --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07879837485772763, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0013599444284337506 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.19465714656847785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0029003189713371035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.11065247004530705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0017812843994385369 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.006994168819790609, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0004648272193447288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.017050685879784068, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0011197895634276233 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.009780488414863557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0006426504271738838 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06626180423614893, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00103152044172589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.16506949443493862, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002243999574785469 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0932686816822836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001351090628116566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.06443833252424745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0010900785870787669 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + 
"rougeLsum_recall": 0.1605294884501161, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0024099921996914896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.09068800328595211, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0014406209430729104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.37658580921002954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03383903342773015 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..70da9a0712c6ca5d9dfa42e9ea4210c504f01454 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0889047742823774, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0015736385663893536 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.21871321971864371, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003489898707290529 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.12480432119777549, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0021054789997765754 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.009892665457522843, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0006521664121306877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.02437366955189581, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0015659961501778526 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013892927856252131, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009041385431726432 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.07214973246105617, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00116007366829089 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1791872648503229, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002631093758225792 + }, + { + "task_name": "gem_xsum", + "prompt_name": 
"article_DOC_summary", + "rougeL_fmeasure": 0.1015360359782745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0015523847061620607 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.07278538171522661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001284146511191204 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.18033245428711908, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0029284834426938845 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.10235719035547435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0017287630454853157 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.5613289650024746, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06954224370193056 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..449ab70fff6e24e56b4aac36c8215271b3fab888 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.09397556603911704, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017994169570713602 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.22313205728599414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0039035927988892247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.12920118195381422, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002284952127981043 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.01255758062915019, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007553728961037906 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.030710887496420355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018415895742210585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.017334182525631046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010127873838950666 + }, + { + "task_name": 
"gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.07567512644757472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013958572399353046 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.18027261644595613, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003034050646720842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1039819933524293, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017400868646152738 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0775560956207995, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015015791755260772 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.1849635012704667, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003277420997428528 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.10658308430361232, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0018787907354694993 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.7079488064244481, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07074366573560065 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4bb7991c374680493aec01d4bf341ec72a9bac95 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.03264760362151805, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0020833568971290787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.061777608876693486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0037289394788052755 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.03895580700792708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002269413843476501 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0038550154412843093, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 
0.0004446008638460962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.009089864083505584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0010911425483655028 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0051666306699138615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0005905785787950259 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.025565238785773294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016418229878507925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.04842391895975862, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.002904785956254577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.030269389323315635, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0017191936988352017 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.026709322906345055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017170791129193154 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.050929122969953576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0031086814353982915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.03182638094199837, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001849989660796298 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.4082429912701259, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0779765946109533 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..01e4f13b571adbde59f6c1ab33139cc9c580bfdd --- /dev/null +++ b/4b284b12bc4seed3/evaluation/generation/slim.4b284b12bc4seed3_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002487457924399334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0007733944656593977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0019894770885392197, + "dataset_path": "GEM/xsum", + "dataset_name": null, + 
"subset": "", + "rouge1_recall_stderr": 0.000580222680643798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0021817427772383405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006550320885850304 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0003335239184295789, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002474907080024258 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00022186932564291056, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00016062217222749233 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00026626492060156015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00019473030694105065 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0018641694287175915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005696317403990395 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.001490936334422112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004262092207690586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.00163204036870977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00048037762637480547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0018726329116100717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005610053066563542 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0015127742100335723, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00042560130622958597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0016483540386329559, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004762553014171529 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.3068185785066287e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.496274000069576e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_0.csv b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..96f099a58388801c0bb5af921477e940b7a86159 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_0.csv @@ -0,0 +1,21 @@ 
+task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229868,0 +anli_r2,acc,0.336,0.014944140233795027,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.2619453924914676,0.012849054826858114,0 +arc_challenge,acc_norm,0.295221843003413,0.01332975029338232,0 +arc_easy,acc,0.569023569023569,0.010161552863493758,0 +arc_easy,acc_norm,0.5071548821548821,0.01025873302244637,0 +boolq,acc,0.617737003058104,0.008499149690449282,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.21956970232832299,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4629555865365465,0.0049760677264325615,0 +hellaswag,acc_norm,0.6056562437761402,0.004877104939356237,0 +piqa,acc,0.7442872687704026,0.010178690109459857,0 +piqa,acc_norm,0.7475516866158868,0.010135665547362354,0 +rte,acc,0.5848375451263538,0.02966006629089349,0 +sciq,acc,0.83,0.01188449583454167,0 +sciq,acc_norm,0.734,0.013979965645145158,0 +storycloze_2016,acc,0.7097808658471406,0.010495529690730063,0 +winogrande,acc,0.5832675611681136,0.01385625007279632,0 diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_0_lm-eval_global_step80108_2023-02-24-15-37-26_0shots_backup.json b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_0_lm-eval_global_step80108_2023-02-24-15-37-26_0shots_backup.json deleted file mode 100644 index 9b634089e9d229284a49bb8d63491f8a78bdd095..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_0_lm-eval_global_step80108_2023-02-24-15-37-26_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229868 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795027 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.21956970232832299 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4629555865365465, - "acc_stderr": 0.0049760677264325615, - "acc_norm": 0.6056562437761402, - "acc_norm_stderr": 0.004877104939356237 - }, - "rte": { - "acc": 0.5848375451263538, - "acc_stderr": 0.02966006629089349 - }, - "winogrande": { - "acc": 0.5832675611681136, - "acc_stderr": 0.01385625007279632 - }, - "storycloze_2016": { - "acc": 0.7097808658471406, - "acc_stderr": 0.010495529690730063 - }, - "boolq": { - "acc": 0.617737003058104, - "acc_stderr": 0.008499149690449282 - }, - "arc_easy": { - "acc": 0.569023569023569, - "acc_stderr": 0.010161552863493758, - "acc_norm": 0.5071548821548821, - "acc_norm_stderr": 0.01025873302244637 - }, - "arc_challenge": { - "acc": 0.2619453924914676, - "acc_stderr": 0.012849054826858114, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.01332975029338232 - }, - "sciq": { - "acc": 0.83, - "acc_stderr": 0.01188449583454167, - "acc_norm": 0.734, - "acc_norm_stderr": 0.013979965645145158 - }, - "piqa": { - "acc": 0.7442872687704026, - "acc_stderr": 0.010178690109459857, - "acc_norm": 0.7475516866158868, - "acc_norm_stderr": 0.010135665547362354 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_1.csv 
b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..8ac04334946785bd0b3e11622f77851785925880 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.014842213153411242,0 +anli_r2,acc,0.338,0.014965960710224487,0 +anli_r3,acc,0.3333333333333333,0.0136139500102256,0 +arc_challenge,acc,0.26706484641638223,0.01292893319649636,0 +arc_challenge,acc_norm,0.30631399317406144,0.013470584417276513,0 +arc_easy,acc,0.5892255892255892,0.01009510134934865,0 +arc_easy,acc_norm,0.5361952861952862,0.01023286555034674,0 +boolq,acc,0.6186544342507645,0.008495245917063564,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.34164884770729387,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.46634136626170086,0.00497846269096693,0 +hellaswag,acc_norm,0.6101374228241386,0.004867221634461266,0 +piqa,acc,0.7459194776931447,0.010157271999135041,0 +piqa,acc_norm,0.7524483133841132,0.010069703966857114,0 +rte,acc,0.5776173285198556,0.029731622646495887,0 +sciq,acc,0.847,0.011389500459665532,0 +sciq,acc_norm,0.777,0.013169830843425673,0 +storycloze_2016,acc,0.7167290219134153,0.010419760409155363,0 +winogrande,acc,0.5895816890292028,0.013825107120035861,0 diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_1_lm-eval_global_step80108_2023-02-24-15-37-26_1shots_backup.json b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_1_lm-eval_global_step80108_2023-02-24-15-37-26_1shots_backup.json deleted file mode 100644 index 0343d7498b5b8d3a367ef637b8104c5b44d17878..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_1_lm-eval_global_step80108_2023-02-24-15-37-26_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.014842213153411242 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224487 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.0136139500102256 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.34164884770729387 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.46634136626170086, - "acc_stderr": 0.00497846269096693, - "acc_norm": 0.6101374228241386, - "acc_norm_stderr": 0.004867221634461266 - }, - "rte": { - "acc": 0.5776173285198556, - "acc_stderr": 0.029731622646495887 - }, - "winogrande": { - "acc": 0.5895816890292028, - "acc_stderr": 0.013825107120035861 - }, - "storycloze_2016": { - "acc": 0.7167290219134153, - "acc_stderr": 0.010419760409155363 - }, - "boolq": { - "acc": 0.6186544342507645, - "acc_stderr": 0.008495245917063564 - }, - "arc_easy": { - "acc": 0.5892255892255892, - "acc_stderr": 0.01009510134934865, - "acc_norm": 0.5361952861952862, - "acc_norm_stderr": 0.01023286555034674 - }, - "arc_challenge": { - "acc": 0.26706484641638223, - "acc_stderr": 0.01292893319649636, - "acc_norm": 0.30631399317406144, - "acc_norm_stderr": 0.013470584417276513 - }, - "sciq": { - "acc": 0.847, - "acc_stderr": 0.011389500459665532, - "acc_norm": 0.777, - "acc_norm_stderr": 0.013169830843425673 - }, - "piqa": { - "acc": 0.7459194776931447, - "acc_stderr": 0.010157271999135041, - "acc_norm": 0.7524483133841132, - "acc_norm_stderr": 0.010069703966857114 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 
0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_2.csv b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..c3c2b03bfd17b40393f6e8d5727140f3cf3bc926 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.014842213153411245,0 +anli_r2,acc,0.322,0.014782913600996664,0 +anli_r3,acc,0.345,0.013728421539454878,0 +arc_challenge,acc,0.2636518771331058,0.012875929151297046,0 +arc_challenge,acc_norm,0.3046075085324232,0.013449522109932489,0 +arc_easy,acc,0.5951178451178452,0.010072423960395703,0 +arc_easy,acc_norm,0.5484006734006734,0.010211600726405232,0 +boolq,acc,0.6119266055045871,0.008523130584760836,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.2921212121212121,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4651463851822346,0.004977643730848592,0 +hellaswag,acc_norm,0.6108344951204939,0.004865645485910439,0 +piqa,acc,0.750816104461371,0.010091882770120216,0 +piqa,acc_norm,0.7562568008705114,0.010017199471500609,0 +rte,acc,0.5812274368231047,0.029696661081234834,0 +sciq,acc,0.854,0.011171786285496501,0 +sciq,acc_norm,0.787,0.012953717566737228,0 +storycloze_2016,acc,0.7194013896312133,0.01038980964728882,0 +winogrande,acc,0.5966850828729282,0.013787257285896248,0 diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_2_lm-eval_global_step80108_2023-02-24-15-37-25_2shots_backup.json b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_2_lm-eval_global_step80108_2023-02-24-15-37-25_2shots_backup.json deleted file mode 100644 index ecd254be2fc017c7fc618687b75a484e28dc571d..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_2_lm-eval_global_step80108_2023-02-24-15-37-25_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.014842213153411245 - }, - "anli_r2": { - "acc": 0.322, - "acc_stderr": 0.014782913600996664 - }, - "anli_r3": { - "acc": 0.345, - "acc_stderr": 0.013728421539454878 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.2921212121212121 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4651463851822346, - "acc_stderr": 0.004977643730848592, - "acc_norm": 0.6108344951204939, - "acc_norm_stderr": 0.004865645485910439 - }, - "rte": { - "acc": 0.5812274368231047, - "acc_stderr": 0.029696661081234834 - }, - "winogrande": { - "acc": 0.5966850828729282, - "acc_stderr": 0.013787257285896248 - }, - "storycloze_2016": { - "acc": 0.7194013896312133, - "acc_stderr": 0.01038980964728882 - }, - "boolq": { - "acc": 0.6119266055045871, - "acc_stderr": 0.008523130584760836 - }, - "arc_easy": { - "acc": 0.5951178451178452, - "acc_stderr": 0.010072423960395703, - "acc_norm": 0.5484006734006734, - "acc_norm_stderr": 0.010211600726405232 - }, - "arc_challenge": { - "acc": 0.2636518771331058, - "acc_stderr": 0.012875929151297046, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.013449522109932489 - }, - "sciq": { - "acc": 0.854, - "acc_stderr": 0.011171786285496501, - "acc_norm": 0.787, - "acc_norm_stderr": 0.012953717566737228 - }, - "piqa": { - "acc": 0.750816104461371, - "acc_stderr": 0.010091882770120216, - "acc_norm": 0.7562568008705114, - 
"acc_norm_stderr": 0.010017199471500609 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_3.csv b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..6b216f0f45834db4dba7b86e2f3601f6a648dcca --- /dev/null +++ b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932577,0 +anli_r2,acc,0.337,0.01495508791865361,0 +anli_r3,acc,0.35083333333333333,0.013782212417178199,0 +arc_challenge,acc,0.27474402730375425,0.013044617212771227,0 +arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0 +arc_easy,acc,0.5951178451178452,0.010072423960395701,0 +arc_easy,acc_norm,0.5585016835016835,0.010189314382749934,0 +boolq,acc,0.6134556574923548,0.008516943934341978,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.28810120539443845,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4658434574785899,0.004978124945759844,0 +hellaswag,acc_norm,0.6102370045807608,0.0048669971103881965,0 +piqa,acc,0.7459194776931447,0.010157271999135041,0 +piqa,acc_norm,0.7595212187159956,0.009971345364651068,0 +rte,acc,0.5776173285198556,0.029731622646495887,0 +sciq,acc,0.855,0.011139977517890134,0 +sciq,acc_norm,0.794,0.012795613612786548,0 +storycloze_2016,acc,0.729021913415286,0.010278188399635051,0 +winogrande,acc,0.5935280189423836,0.013804448697753376,0 diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_3_lm-eval_global_step80108_2023-02-24-15-37-25_3shots_backup.json b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_3_lm-eval_global_step80108_2023-02-24-15-37-25_3shots_backup.json deleted file mode 100644 index 819953f6a017d62d23a751e677edc33f0da76514..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_3_lm-eval_global_step80108_2023-02-24-15-37-25_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932577 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.01495508791865361 - }, - "anli_r3": { - "acc": 0.35083333333333333, - "acc_stderr": 0.013782212417178199 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.28810120539443845 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4658434574785899, - "acc_stderr": 0.004978124945759844, - "acc_norm": 0.6102370045807608, - "acc_norm_stderr": 0.0048669971103881965 - }, - "rte": { - "acc": 0.5776173285198556, - "acc_stderr": 0.029731622646495887 - }, - "winogrande": { - "acc": 0.5935280189423836, - "acc_stderr": 0.013804448697753376 - }, - "storycloze_2016": { - "acc": 0.729021913415286, - "acc_stderr": 0.010278188399635051 - }, - "boolq": { - "acc": 0.6134556574923548, - "acc_stderr": 0.008516943934341978 - }, - "arc_easy": { - "acc": 0.5951178451178452, - "acc_stderr": 0.010072423960395701, - "acc_norm": 0.5585016835016835, - "acc_norm_stderr": 0.010189314382749934 - }, - "arc_challenge": { - "acc": 0.27474402730375425, - "acc_stderr": 0.013044617212771227, - "acc_norm": 0.3165529010238908, - "acc_norm_stderr": 0.01359243151906808 - }, - "sciq": { - "acc": 
0.855, - "acc_stderr": 0.011139977517890134, - "acc_norm": 0.794, - "acc_norm_stderr": 0.012795613612786548 - }, - "piqa": { - "acc": 0.7459194776931447, - "acc_stderr": 0.010157271999135041, - "acc_norm": 0.7595212187159956, - "acc_norm_stderr": 0.009971345364651068 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_4.csv b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..5fddd41fb5efff55ab00a5f450296d33053a26af --- /dev/null +++ b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.351,0.015100563798316407,0 +anli_r2,acc,0.333,0.014910846164229859,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.2636518771331058,0.012875929151297046,0 +arc_challenge,acc_norm,0.3046075085324232,0.01344952210993249,0 +arc_easy,acc,0.5968013468013468,0.0100656685767948,0 +arc_easy,acc_norm,0.5555555555555556,0.01019625483869168,0 +boolq,acc,0.6174311926605505,0.008500443818876161,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.30142857142857143,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4660426209918343,0.004978260641742204,0 +hellaswag,acc_norm,0.6097390957976498,0.00486811759848194,0 +piqa,acc,0.7453754080522307,0.01016443223706048,0 +piqa,acc_norm,0.7551686615886833,0.0100323091055688,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.841,0.01156947936827129,0 +sciq,acc_norm,0.798,0.012702651587655137,0 +storycloze_2016,acc,0.7177979690005345,0.010407834479647673,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_4_lm-eval_global_step80108_2023-02-24-15-37-26_4shots_backup.json b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_4_lm-eval_global_step80108_2023-02-24-15-37-26_4shots_backup.json deleted file mode 100644 index 715dbfd027615ce5cd12c24313a6fc18763cf7bc..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_4_lm-eval_global_step80108_2023-02-24-15-37-26_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316407 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229859 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.30142857142857143 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4660426209918343, - "acc_stderr": 0.004978260641742204, - "acc_norm": 0.6097390957976498, - "acc_norm_stderr": 0.00486811759848194 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.010407834479647673 - }, - "boolq": { - "acc": 0.6174311926605505, - "acc_stderr": 0.008500443818876161 - }, - "arc_easy": { - "acc": 0.5968013468013468, - "acc_stderr": 0.0100656685767948, - "acc_norm": 0.5555555555555556, - "acc_norm_stderr": 
0.01019625483869168 - }, - "arc_challenge": { - "acc": 0.2636518771331058, - "acc_stderr": 0.012875929151297046, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.01344952210993249 - }, - "sciq": { - "acc": 0.841, - "acc_stderr": 0.01156947936827129, - "acc_norm": 0.798, - "acc_norm_stderr": 0.012702651587655137 - }, - "piqa": { - "acc": 0.7453754080522307, - "acc_stderr": 0.01016443223706048, - "acc_norm": 0.7551686615886833, - "acc_norm_stderr": 0.0100323091055688 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_5.csv b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..1a841fda0cb56b2a2685b7fa1cd56090b53f5c67 --- /dev/null +++ b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932573,0 +anli_r2,acc,0.348,0.01507060460376841,0 +anli_r3,acc,0.3616666666666667,0.01387613166312387,0 +arc_challenge,acc,0.2781569965870307,0.013094469919538807,0 +arc_challenge,acc_norm,0.31313993174061433,0.013552671543623501,0 +arc_easy,acc,0.5909090909090909,0.01008877515261579,0 +arc_easy,acc_norm,0.5580808080808081,0.010190328123071768,0 +boolq,acc,0.6168195718654435,0.008503021391450783,1 +cb,acc,0.44642857142857145,0.06703189227942397,1 +cb,f1,0.3113026819923372,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.465345548695479,0.004977782217582457,0 +hellaswag,acc_norm,0.6089424417446724,0.004869899297734556,0 +piqa,acc,0.7464635473340587,0.010150090834551786,0 +piqa,acc_norm,0.7540805223068553,0.010047331865625184,0 +rte,acc,0.555956678700361,0.029907396333795994,0 +sciq,acc,0.845,0.01145015747079947,0 +sciq,acc_norm,0.801,0.012631649083099182,0 +storycloze_2016,acc,0.7199358631747729,0.010383764993920483,0 +winogrande,acc,0.5872138910812944,0.013837060648682101,0 diff --git a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_5_lm-eval_global_step80108_2023-02-24-15-37-26_5shots_backup.json b/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_5_lm-eval_global_step80108_2023-02-24-15-37-26_5shots_backup.json deleted file mode 100644 index 7de8ac65141a7458060f8ed9313ff177a24af65a..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed3/evaluation/rankeval/4b284b12bc4seed3_5_lm-eval_global_step80108_2023-02-24-15-37-26_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r2": { - "acc": 0.348, - "acc_stderr": 0.01507060460376841 - }, - "anli_r3": { - "acc": 0.3616666666666667, - "acc_stderr": 0.01387613166312387 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942397, - "f1": 0.3113026819923372 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.465345548695479, - "acc_stderr": 0.004977782217582457, - "acc_norm": 0.6089424417446724, - "acc_norm_stderr": 0.004869899297734556 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795994 - }, - "winogrande": { - "acc": 0.5872138910812944, - "acc_stderr": 0.013837060648682101 - }, - "storycloze_2016": { - "acc": 0.7199358631747729, - "acc_stderr": 0.010383764993920483 - }, - 
"boolq": { - "acc": 0.6168195718654435, - "acc_stderr": 0.008503021391450783 - }, - "arc_easy": { - "acc": 0.5909090909090909, - "acc_stderr": 0.01008877515261579, - "acc_norm": 0.5580808080808081, - "acc_norm_stderr": 0.010190328123071768 - }, - "arc_challenge": { - "acc": 0.2781569965870307, - "acc_stderr": 0.013094469919538807, - "acc_norm": 0.31313993174061433, - "acc_norm_stderr": 0.013552671543623501 - }, - "sciq": { - "acc": 0.845, - "acc_stderr": 0.01145015747079947, - "acc_norm": 0.801, - "acc_norm_stderr": 0.012631649083099182 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551786, - "acc_norm": 0.7540805223068553, - "acc_norm_stderr": 0.010047331865625184 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..598b596b8f9d0fe26dca37b24e70552c002a5684 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4255419689866024, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04553671379431864}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07691162970808266, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015499735187471276}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3361349423128226, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004717900225751208}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11709985442438758, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019235811070200824}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03634094203430491, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010568736203451017}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16382279258438331, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032641856127605708}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05502825027290272, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012188176771805944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07347783656701605, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014400832686900517}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.325262737431151, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046026209075170806}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11222201441152387, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017898759559440783}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07247687306153981, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014666193171867594}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.31667827309434965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004394255465374989}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1102576752933881, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017934408668905857}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..66720cd026bb64bb974dac08c8633747e3c54f54 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4375148581871164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.043362378909165875}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.084573986124589, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022156458659289445}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32974434511651185, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004760360381005011}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.12097522483010485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002202684090770471}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.040740790020171655, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014239356080917533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16249675831000945, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003343995882368083}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05811058513302895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014298876541859705}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07979616615847458, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0020050186616118407}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31708796648649956, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004617552099836135}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11484527988661357, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0020020380133125328}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07967262232949184, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002074014298894367}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3110596157987853, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004435869800526469}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11394268478061556, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020477402089164776}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..34a1e839cf3b7ebc4324635fc664a625c554a273 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.43895421656508316, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03560314454345476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07893892499245164, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017559781579577936}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33147399394577687, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004792853072557146}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11837525964996032, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020736790107389433}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03778130469895934, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011220289463223655}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16607823925825196, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003428415308098998}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05685211598675336, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0013242460972892624}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0751384880267446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0016250699836019358}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31955459951422993, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004633747457550051}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11305526526705706, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0019076101565052952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07484212840598793, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0016673128268097014}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.31478623106918857, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004462671481456754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11212261864528106, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019332204600801869}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ee16ff28a5949cfb1427cdd0eed20a8841adfe84 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4055173976621406, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03388942315369854}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07564013742377862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001624130926371493}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32832027029775396, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004754946563417434}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11521270106633405, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020223822680663353}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03600203329160531, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0010611943954759993}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16248350820338053, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003392520543251839}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.054818451753863656, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012891162497869618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07201677299268112, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015006281609086204}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31619306971917027, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004595788592278942}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.11001628031350326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018702505486722842}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07139551984673621, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001521052983920996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3106722185590567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004410951996082152}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10876033036378392, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00187477057348602}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3fd2626c0fa2e2f664a69faeaf5b67239a0ae03d --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4219802204390914, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.035062189637818944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07604999251191036, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001694487867306116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.32739311904589957, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004733990565227727}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11435782016646073, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019930832962392476}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03608787195276645, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009682458617317419}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16298585196913531, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0033617720441310407}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05493385212823661, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012747653539780157}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.0719402479836372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015114912843032233}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3140568060068406, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004584266596245509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1087646065353738, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018365048381652636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.0717813619994629, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015531796251007297}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30959297417269843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004389147529346589}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.10814461597064734, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018690945883989335}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..0b43ecdf772c7f8107206b24810a2d3b20b5db45 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.41421236223783364, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03144881485668538}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07376815882972376, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015413873275619697}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3269107430349855, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004768390550563892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11235776546508958, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019443755895376892}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03507239155053733, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.000919535201162236}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16374442513917464, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003472789622253923}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05370114479085305, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012337413746734015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06977608287770509, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001374956676465801}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.31368077194090394, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004624757562111944}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10679593308879384, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017931240322613966}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06944197659188452, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001414266115795263}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.30880609903141404, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004454929611901481}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.1058735384538628, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001807570321110346}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f6b00fa4e6afb47fc9df6d6adf014deea3afcb32 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.06498870832691263, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010723261619156347}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.09208329086891458, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0014107679511711905}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.070437989859743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010312902668252013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0031259330192821917, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001792020567470567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0041562592064827136, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0002654234746028053}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0032754676506289828, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00018463100249520333}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.058599346602679785, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008924974727175551}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.08428848354699885, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001258671638105778}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0638808755302006, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008731247333910541}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.06184900174439773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001011647200269575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.0878197954616511, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013420548769352185}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.06702516127871343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009693030073720331}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.0489790785966099, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.006779339030578523}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8907915243c451fbf52d704b500c580c2459aa81 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07527407433133869, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012353167292243439}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.11879070038037605, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001666703743793988}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.08534345009310793, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012180416276689969}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.0037334174812980337, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00022288691443581962}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.005917125245253962, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0003984864275118756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004199685382427501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002477120467452383}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.06809342794848382, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010182876153051106}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", 
"rougeL_recall": 0.11003512325420707, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0014770539784070803}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0779496715247138, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00102035551714523}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07054606286211522, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001139963924498013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.11204758740190628, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0015572733289150008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.08013661978467537, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011236586288798575}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.19435530388407438, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.027474135486464886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..54c83109930be5f68a29b1a163ace87935cd5a67 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.08188916742697006, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0012604740470487253}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.12863501946937672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0017306118753458995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.09264326558315315, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012341782878536826}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.00417689519312021, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00027799524088823336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0070394381668830155, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.000549533007866243}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.004799135265562465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003258383358742087}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07469229492018686, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0010462849454463273}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.11996588099996168, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0015431639618236344}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.08530978440523501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| 
{{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010419970840251892}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07637785471073269, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011637595201347114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.12067856570386956, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016119906969829974}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.08652135177894667, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011347677035999774}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.26166447349576716, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02664531207793444}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..80eec76d51f6c27f1ca87e3649db239b7e5b33de --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.07918559281464797, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015732102907079421}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.11713311231879246, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 
0.0018694365480156567}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.08496018846964883, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013534144382857925}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.005450855204647191, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00042451344437272563}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.00786801734624306, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005418684959080103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.005617198716065193, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003504250975971709}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.07101336043004314, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001347924853763445}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.10774352434928229, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016488819148997246}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.07686453493425864, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011252346058277117}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.07334057105853452, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00146993475075726}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.108988272664584, "fixed_answer_choice_list": null, 
"dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0017377582372509691}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.07875700339703082, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012486260433925756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.35676801434287403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04322442294335073}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..7674c14da1b0a5179f4979c8eebcd06c61bb7eed --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.030329065224279812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001330278405910414}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.04517297214250825, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001758818657887946}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.03139283202672829, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0012003402420825678}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002783763831604278, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003242150810229513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0044964737985171525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005138289413575867}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0029260675465866874, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00030134314126436455}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.02657200435528566, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011035850530213228}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.04083946585151625, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001540836819231046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.02785801817001536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0010086311227078628}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.027847356181606008, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0012301635576539068}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.041473623387033184, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0016278414776434958}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.028747368698632404, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rougeLsum_fmeasure_stderr": 0.0010989439749250103}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.1251055397122511, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.014484092743724803}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3f6959cc8713d07375be8028db26609b7cbbfbe9 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.00546467811919465, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0006252571429574801}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.007588094977815991, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007854778326407263}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.005312217579045015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0005447805558351684}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.00044339189228553947, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 9.961401108641653e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0006655482990875535, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001814575634003083}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": 
"tldr_en", "rouge2_fmeasure": 0.0004573993622119604, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 9.93009701272525e-05}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.00461981647245416, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0005033099668854365}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.006788021218410852, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006881154075077389}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.0045916498026018614, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004489570697479766}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.004946550238006405, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0005649935199743343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.007016292908843859, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0007337348506316745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.004852566252508574, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004994751682757694}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.608658724355054e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.1367178740967643e-07}], "config": {"model": "hf-causal", "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f3fc04cef125970c47c714dfdd6930edeeee1da7 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 0.06113312558172764, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01575172189851365}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.014710592817317623, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0003529315721245975}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.026426487712639557, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0006726265968105224}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.018526574610266466, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004491583645956695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.0004867368803311285, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.864693694964923e-05}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.0009789004289938045, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0001645546445630159}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.0006359667540750598, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0001041733500992725}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.014689240988132256, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00035074403880002916}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.026390402516371297, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0006695145672545542}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.018499794220557017, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00044650709340406276}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.013873433046158122, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0003117470202763751}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.024938658187947462, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005998309209761277}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.017471035147790994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0003971392931020627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..085c10bd3e2dd44e232e9e36fbdb2e57941afc63 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 5.8201208498355745, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07918003623517482}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.25691930916492756, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.002584112631865945}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.4396823419994151, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002924006390564013}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.2978972679324119, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001938270939643692}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.10986735481075849, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0016737706226266926}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.18795573844856492, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0020345189689034695}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.12463514115806919, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.001299783869728999}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2108434140927592, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021087197045539206}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3655838393962915, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0025164390500460707}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.24464314098146767, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014611191185898375}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.2157653393823411, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0024335105491588797}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36407468797098697, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026761965498774284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2473804112744828, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017953159637369525}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..7410540d3ff5ece31400122601297a4f1c582938 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.8213920736753035, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.09269849650267996}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.29394430959429885, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0029349960483685197}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.44889829988946167, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026008346109833076}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3268772449979945, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002037185940562139}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1343978103369073, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019313519405930251}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.20303674980966369, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019154842806783233}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1463066245452882, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014390553252013872}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2299825363481501, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021765504098611448}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.36308738357631903, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023177876330520717}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.25892225643390154, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00149376807780425}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.24820013238470168, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0027165241805849048}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.37474754473838195, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024667758421094027}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2738908855936707, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019355452082937584}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..4ee264b02f8e16e30f2b34f9b1c203531bb454a6 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 6.942009450714699, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04290812234950382}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.30906162515100283, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0030674030901339717}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.44080702325897825, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026014559705604258}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.33114374289634996, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019963064576782263}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.14318958988373076, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0020155780018855157}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.2013842488461, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0019010681475652355}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.14999298186863755, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014464336948624026}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.24226611193261477, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002287690467824282}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.35660966290964163, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002307251432723998}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.26265901338393766, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014797491179515284}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.26123456663930006, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0028608851643456765}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.3668591988145458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024536597226958376}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.27712766233037806, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019244037043805748}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..44391eb727301b7191928c3fdc8d75c638c08b36 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 7.50519306139533, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.12141535391721209}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.3382900801328086, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0031870177107722864}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.43823825667258115, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025150913221159833}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.35014858130045096, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020798805965393467}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.1562658891246859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002090506730559318}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.199427975848365, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018891086525461194}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1583408367591379, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015305939174917813}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.25544052575853743, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0023438446152810907}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.34203035767325535, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022562591465260075}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.26748399521207994, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001534553168600716}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.2842748733336647, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.002916261382644648}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36471610019410694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002385680406071139}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.2924671515205407, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001988178356101477}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6349500a7e069fe23726ad4606a0de4de056e782 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 8.909934598498245, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.11355686792329017}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.37588510027488736, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0028779178551135293}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.43674914985883706, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0024638439091214433}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.3791546852434602, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020200693077103015}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.17238666817345458, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0019278848044138694}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.19980355257879728, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0018510324254879687}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.1726081857656977, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015716694288982242}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.2762695240145616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0021890512735288354}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3283605962966383, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002209924055940966}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.28073901470903295, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001614330726795435}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.3186956691262859, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0026612676205351017}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.36923270586040546, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023640716126389163}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.32085059046180675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. 
||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019675465538199436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..b56f09801753cbd4de09dad1218dc33613b0197e --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.07192798210581976, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010917008658073372}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.17738230504867103, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023612645642943995}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.10102973537775274, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014355102738807643}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.004288158761745872, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002700287405913071}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.010847911832547128, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006940494564969847}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.006073255625355645, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 
0.0003809878915466881}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.061860157155504046, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008875877382502058}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.15379181870207026, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019969008216420697}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.08708997150339674, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011768137916529835}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.061491345289910605, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009161557912449834}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.15291182401919987, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0020815555284572295}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.08656040328674863, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001216757422923546}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.06212804417145812, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.011971318709799528}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.json 
b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d999e2188c3ef783d42a41b1b23460d3e942caaa --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06505394396909397, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010345000379844767}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.16367528098007583, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002310164884000985}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.09194088936961643, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0013783104926480619}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.004211301109017673, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002712127686911097}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.010640558564493393, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006925012884316564}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.005968104627521815, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003824456236105915}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.05824585599255965, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008851968023125635}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.14716783464641015, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": 
"Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020015127545039207}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.08242443142898405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0011823524939069798}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05601648748687626, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008514365416564917}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.14220135901573897, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001984633594195078}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.07935310191197463, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0011428193509701132}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.07413849235928043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.01976351303187912}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..69c3aaaec893c6a0030439f702671469114026bc --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.05970144726407935, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", 
"rouge1_precision_stderr": 0.0010808131881698626}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.15032918128991743, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0025278263617417337}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.08444066513722136, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0014657483966925}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.004734886121730911, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00030986275466350464}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.012150770146827796, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008496354899205403}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00672052705909708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0004405404886083871}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0545565395708172, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0009399168505439712}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.13773639312565078, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022148411509370623}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.07722118221070728, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012763559941414496}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.05216833775619172, "fixed_answer_choice_list": null, "dataset_path": 
"GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008929985372574541}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.1322606270465344, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002157181329873677}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.07391840483447609, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012190936937503734}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.10304318645626095, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.02590406105837305}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..58e7883028ca4b2fe6bf6b3d17388135c12011f3 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.05815946761165747, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001380812200134281}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.13549250184452813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0027208611626966957}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.07896539889705549, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: 
{{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0016540196570312491}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0051386914240500875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00040605626704506864}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.01205042518636505, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008745757698404825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.006943244163002554, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0005154856926939987}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.05358406774115243, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0011716163642417996}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.1260775001189485, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023957295090012844}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.07314249899760741, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014324655345679602}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.050377295082503465, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0011611779272879762}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.11809508568654702, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0022806675190111193}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", 
"rougeLsum_fmeasure": 0.06851629797589617, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0013679563501104437}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.22427754093829802, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.034552127103896886}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6cfef250f681138c63e07a65c176a7e611a9de23 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.02214933884629678, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016410873850719364}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.03772298255486349, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0023501327317525427}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.024599325309196316, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015226073238554963}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0020692193883074857, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003062075285410719}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.004199075477211875, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, 
"subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006094344180167119}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0026139422307253986, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00037040229047969556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.020330750956918043, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001451030088641699}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.03484763177326402, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020977393818014333}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.022646617016988464, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0013512159194685597}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.01900606349570929, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014097333664542056}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.031891578587321945, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0019321769969979038}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.020880117459677255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0012684304431225134}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.20252486202824954, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 
0.0451733186712267}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..68c4b024bba74f231e0ec923ae3827e453aeb087 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/agg.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0019620307874586742, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0005451730856114239}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0017379629055225318, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0004813113588013356}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0018102133465432942, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0004995256780505414}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.00010762452493862039, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 7.62180455261845e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0001072041166380789, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 7.577221663047183e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.00010730891049598514, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 7.588337415201644e-05}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", 
"rougeL_precision": 0.0017313225130562591, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0004677255640486146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0015468066285396214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0004190027142908825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0016046615232249279, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0004323354423624825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0017313225130562591, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0004677255640486146}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0015468066285396214, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004190027142908825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0016046615232249279, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004323354423624825}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.6165965257139506e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 3.022588684192216e-34}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.jsonl 
b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c98782b302d8e5d1f569e42707fec37199550036 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:937878380ce866948abc73b1d11f7b65b4d3e31b7377f94063250b68e6ee8ce8 +size 4087992 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..375bd9478abed9bbb4e7dccd297790fdc8ee39b4 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07597bee99c45c1bb036df2c95683cfb565e85843b4d8ce7af1eb4e7eb685460 +size 4984460 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8044705e4e346f49e52407f213159125f295e989 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f426f9b5373cedc25d551008b40cc23f52a16ef4af342b5d863f084f3db568b +size 5892765 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8186ff76b90389e9d7f6b64d14e21af2297f84ec 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62790b7ad4fa4a2068b80c9b1305418ed7785c91204465fed6e4d58e5d6c3b6a +size 6798575 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a9fc50957555c2f282dce4f5f635e3aa770fc421 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:546037073d9384b3eec6019948af6d55f4dadd4b62725da47e1ab69641b0561e +size 7698677 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.jsonl index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8544bd83a74ea5b34534c0a4f8850b49cc4841f4 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb579f67ec9ae5a87374e66eda36182c2670b195f840d303309922584a9485b +size 8616241 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e1dc593a1303772d3d2987c73031479a622f5776 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39c1b29aefbd63934ec688a25c912cc07e88c0d8b1dfab597a462b21d174635f +size 7512442 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..484a43e215b1133fa120b81b0bf2a76ecb5d99f0 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:317832b76334283980310b50ac60742a8d70b5d99a5563f32c4b3db1f1097561 +size 13280118 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f28626fe981d12d1c1ce405920426f1f4ea53bfc 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779e9a6ce4477f76b10ae3e6a0e4344c498a0a4069708c4926d83326e0744401 +size 18875248 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..79e1f9879d229a6ba087ae7b47b6eb29427afc3b 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28fbb89912ce8f2901f899e451ab5c9f62dca4d4817a246331df90e65ed4709 +size 24309060 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..951c9280b83dca7dc82930671388937f14344dd1 100644 --- 
a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99993dd0942b3a7b760bc89ee33db4fd8a029f6b5ca0baa67833c1ac31ae8dfe +size 29467642 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..958526c30559e9ded76c5f928e3b4cd1799c3ea6 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c94979586beec12e3ad216ef0bd97dd307c8294d77637dbbddb96400bd64167f +size 34798915 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..473805cec8526cfa4abdb02ce5f61bbc115500d8 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85d67358046c33ba5f289cef250a3e58caf96a97f7ceaa07bcd5b1871fcff0fc +size 4512722 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..206be1effc531c6ebffcf2fad08a4f84ddd275b0 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20c54864d98cc6247b87d7230869f9fa09f96f5acc34e88bb732506762c2ea9a +size 5498944 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..541f36ea356c1787c53b8d3683d8320c5346bf2b 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5a4a55ea779c36438806dab3185115e13e618c03e4fc35e1426920883e39d08 +size 6522421 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cb845dd36a4c324770c5c6990110dfa766e2f613 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f98270faf192cea6a5226f2af3d4643f479bb71aebb3e10fa732a9758701636a +size 7572537 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..82da278c97d7b37e05d07137d6f40147ac683124 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00f5db8ff5a2361d6d84d4f1b9f51dd1a2ff9a1148a3137a08c0805046ae0c65 +size 8589702 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..91111ed2a6c6fb9dd4c4a37d3d4d65fba66e0f9c 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88252423e9702d8df47a8ce2a16ba92abd7b74b501bdcd6920ad7a146566d56d +size 9574953 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..76bb3a9fce1a8c1b7e97c4120d2f415b5fc3c813 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3631e40514e2b4ce75dd5e2f793e54fabe9a2743ce3ab5e1942d419cf8a68d5b +size 2839120 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2eb6079620346fe6c99b6828a804f131c0b4dc5d 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8f6db80f53b06aee3e2c31fb3c1a785f3bb2e2bc20585fdad5211ce4680a15f +size 5099494 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.jsonl index 
e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..5ecf8f42ab708ded48da2206b0da8c8a2e6400f6 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:588d1325de14188d478a0bb5d9058b5344cc40e35d4664bff3cf6511371d1a4e +size 7363054 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..02d6af72116321df93d2e44aae7bb2b0b293004b 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7be870aac5b20b911f2af7ac615e8a668478999c91021243a17bc864eef7b312 +size 9628689 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b05a109ce0edb5db71376451ae92b77de5a81726 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3314b7c6ffee70fc8aed101dfbcbed680a038b9acc01e1c4feb3d64c7b71b590 +size 11669604 diff --git a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0b07cb725c906f9a4456a9b0cc79aeae274430d8 100644 --- a/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.jsonl +++ b/4b284b12bc4seed4/evaluation/generation/examples.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae28f5e1335d0c6c2283fc51913626a20c1d6af90cb2e34d05e584cee2f1dcb9 +size 13897437 diff --git a/4b284b12bc4seed4/evaluation/generation/merged.csv b/4b284b12bc4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..2d07781073465dacaec1eaed7de32e589117bd39 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0006359667540750598 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0006359667540750598 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.12463514115806919 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.12463514115806919 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1463066245452882 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1463066245452882 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14999298186863755 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.14999298186863755 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.1583408367591379 
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.1583408367591379 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1726081857656977 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1726081857656977 +e2e_nlg_cleaned,5,average,multiple,0.1254199561418176 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.006073255625355645 +gem_xsum,0,median,rouge2_fmeasure,0.006073255625355645 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.005968104627521815 +gem_xsum,1,median,rouge2_fmeasure,0.005968104627521815 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.00672052705909708 +gem_xsum,2,median,rouge2_fmeasure,0.00672052705909708 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.006943244163002554 +gem_xsum,3,median,rouge2_fmeasure,0.006943244163002554 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.0026139422307253986 +gem_xsum,4,median,rouge2_fmeasure,0.0026139422307253986 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00010730891049598514 +gem_xsum,5,median,rouge2_fmeasure,0.00010730891049598514 +gem_xsum,5,average,multiple,0.004737730436033079 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05502825027290272 +web_nlg_en,0,median,rouge2_fmeasure,0.05502825027290272 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05811058513302895 +web_nlg_en,1,median,rouge2_fmeasure,0.05811058513302895 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05685211598675336 +web_nlg_en,2,median,rouge2_fmeasure,0.05685211598675336 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.054818451753863656 +web_nlg_en,3,median,rouge2_fmeasure,0.054818451753863656 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05493385212823661 +web_nlg_en,4,median,rouge2_fmeasure,0.05493385212823661 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05370114479085305 +web_nlg_en,5,median,rouge2_fmeasure,0.05370114479085305 +web_nlg_en,5,average,multiple,0.05557406667760639 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.0032754676506289828 +wiki_lingua_en,0,median,rouge2_fmeasure,0.0032754676506289828 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.004199685382427501 +wiki_lingua_en,1,median,rouge2_fmeasure,0.004199685382427501 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.004799135265562465 +wiki_lingua_en,2,median,rouge2_fmeasure,0.004799135265562465 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.005617198716065193 +wiki_lingua_en,3,median,rouge2_fmeasure,0.005617198716065193 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.0029260675465866874 +wiki_lingua_en,4,median,rouge2_fmeasure,0.0029260675465866874 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0004573993622119604 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0004573993622119604 +wiki_lingua_en,5,average,multiple,0.0035458256539137984 diff --git a/4b284b12bc4seed4/evaluation/generation/merged.json b/4b284b12bc4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..dd81d4114af2bf67eebc592bc25259e376d09674 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4255419689866024, "bleu_stderr": 0.04553671379431864, "rouge1_fmeasure": 0.11709985442438758, "rouge1_fmeasure_stderr": 0.0019235811070200824, "rouge1_precision": 0.07691162970808266, "rouge1_precision_stderr": 0.0015499735187471276, "rouge1_recall": 0.3361349423128226, "rouge1_recall_stderr": 0.004717900225751208, "rouge2_fmeasure": 0.05502825027290272, "rouge2_fmeasure_stderr": 0.0012188176771805944, "rouge2_precision": 0.03634094203430491, "rouge2_precision_stderr": 0.0010568736203451017, "rouge2_recall": 0.16382279258438331, 
"rouge2_recall_stderr": 0.0032641856127605708, "rougeL_fmeasure": 0.11222201441152387, "rougeL_fmeasure_stderr": 0.0017898759559440783, "rougeL_precision": 0.07347783656701605, "rougeL_precision_stderr": 0.0014400832686900517, "rougeL_recall": 0.325262737431151, "rougeL_recall_stderr": 0.0046026209075170806, "rougeLsum_fmeasure": 0.1102576752933881, "rougeLsum_fmeasure_stderr": 0.0017934408668905857, "rougeLsum_precision": 0.07247687306153981, "rougeLsum_precision_stderr": 0.0014666193171867594, "rougeLsum_recall": 0.31667827309434965, "rougeLsum_recall_stderr": 0.004394255465374989}}, "1": {"PALM_prompt": {"bleu": 0.4375148581871164, "bleu_stderr": 0.043362378909165875, "rouge1_fmeasure": 0.12097522483010485, "rouge1_fmeasure_stderr": 0.002202684090770471, "rouge1_precision": 0.084573986124589, "rouge1_precision_stderr": 0.0022156458659289445, "rouge1_recall": 0.32974434511651185, "rouge1_recall_stderr": 0.004760360381005011, "rouge2_fmeasure": 0.05811058513302895, "rouge2_fmeasure_stderr": 0.0014298876541859705, "rouge2_precision": 0.040740790020171655, "rouge2_precision_stderr": 0.0014239356080917533, "rouge2_recall": 0.16249675831000945, "rouge2_recall_stderr": 0.003343995882368083, "rougeL_fmeasure": 0.11484527988661357, "rougeL_fmeasure_stderr": 0.0020020380133125328, "rougeL_precision": 0.07979616615847458, "rougeL_precision_stderr": 0.0020050186616118407, "rougeL_recall": 0.31708796648649956, "rougeL_recall_stderr": 0.004617552099836135, "rougeLsum_fmeasure": 0.11394268478061556, "rougeLsum_fmeasure_stderr": 0.0020477402089164776, "rougeLsum_precision": 0.07967262232949184, "rougeLsum_precision_stderr": 0.002074014298894367, "rougeLsum_recall": 0.3110596157987853, "rougeLsum_recall_stderr": 0.004435869800526469}}, "2": {"PALM_prompt": {"bleu": 0.43895421656508316, "bleu_stderr": 0.03560314454345476, "rouge1_fmeasure": 0.11837525964996032, "rouge1_fmeasure_stderr": 0.0020736790107389433, "rouge1_precision": 0.07893892499245164, "rouge1_precision_stderr": 0.0017559781579577936, "rouge1_recall": 0.33147399394577687, "rouge1_recall_stderr": 0.004792853072557146, "rouge2_fmeasure": 0.05685211598675336, "rouge2_fmeasure_stderr": 0.0013242460972892624, "rouge2_precision": 0.03778130469895934, "rouge2_precision_stderr": 0.0011220289463223655, "rouge2_recall": 0.16607823925825196, "rouge2_recall_stderr": 0.003428415308098998, "rougeL_fmeasure": 0.11305526526705706, "rougeL_fmeasure_stderr": 0.0019076101565052952, "rougeL_precision": 0.0751384880267446, "rougeL_precision_stderr": 0.0016250699836019358, "rougeL_recall": 0.31955459951422993, "rougeL_recall_stderr": 0.004633747457550051, "rougeLsum_fmeasure": 0.11212261864528106, "rougeLsum_fmeasure_stderr": 0.0019332204600801869, "rougeLsum_precision": 0.07484212840598793, "rougeLsum_precision_stderr": 0.0016673128268097014, "rougeLsum_recall": 0.31478623106918857, "rougeLsum_recall_stderr": 0.004462671481456754}}, "3": {"PALM_prompt": {"bleu": 0.4055173976621406, "bleu_stderr": 0.03388942315369854, "rouge1_fmeasure": 0.11521270106633405, "rouge1_fmeasure_stderr": 0.0020223822680663353, "rouge1_precision": 0.07564013742377862, "rouge1_precision_stderr": 0.001624130926371493, "rouge1_recall": 0.32832027029775396, "rouge1_recall_stderr": 0.004754946563417434, "rouge2_fmeasure": 0.054818451753863656, "rouge2_fmeasure_stderr": 0.0012891162497869618, "rouge2_precision": 0.03600203329160531, "rouge2_precision_stderr": 0.0010611943954759993, "rouge2_recall": 0.16248350820338053, "rouge2_recall_stderr": 0.003392520543251839, "rougeL_fmeasure": 
0.11001628031350326, "rougeL_fmeasure_stderr": 0.0018702505486722842, "rougeL_precision": 0.07201677299268112, "rougeL_precision_stderr": 0.0015006281609086204, "rougeL_recall": 0.31619306971917027, "rougeL_recall_stderr": 0.004595788592278942, "rougeLsum_fmeasure": 0.10876033036378392, "rougeLsum_fmeasure_stderr": 0.00187477057348602, "rougeLsum_precision": 0.07139551984673621, "rougeLsum_precision_stderr": 0.001521052983920996, "rougeLsum_recall": 0.3106722185590567, "rougeLsum_recall_stderr": 0.004410951996082152}}, "4": {"PALM_prompt": {"bleu": 0.4219802204390914, "bleu_stderr": 0.035062189637818944, "rouge1_fmeasure": 0.11435782016646073, "rouge1_fmeasure_stderr": 0.0019930832962392476, "rouge1_precision": 0.07604999251191036, "rouge1_precision_stderr": 0.001694487867306116, "rouge1_recall": 0.32739311904589957, "rouge1_recall_stderr": 0.004733990565227727, "rouge2_fmeasure": 0.05493385212823661, "rouge2_fmeasure_stderr": 0.0012747653539780157, "rouge2_precision": 0.03608787195276645, "rouge2_precision_stderr": 0.0009682458617317419, "rouge2_recall": 0.16298585196913531, "rouge2_recall_stderr": 0.0033617720441310407, "rougeL_fmeasure": 0.1087646065353738, "rougeL_fmeasure_stderr": 0.0018365048381652636, "rougeL_precision": 0.0719402479836372, "rougeL_precision_stderr": 0.0015114912843032233, "rougeL_recall": 0.3140568060068406, "rougeL_recall_stderr": 0.004584266596245509, "rougeLsum_fmeasure": 0.10814461597064734, "rougeLsum_fmeasure_stderr": 0.0018690945883989335, "rougeLsum_precision": 0.0717813619994629, "rougeLsum_precision_stderr": 0.0015531796251007297, "rougeLsum_recall": 0.30959297417269843, "rougeLsum_recall_stderr": 0.004389147529346589}}, "5": {"PALM_prompt": {"bleu": 0.41421236223783364, "bleu_stderr": 0.03144881485668538, "rouge1_fmeasure": 0.11235776546508958, "rouge1_fmeasure_stderr": 0.0019443755895376892, "rouge1_precision": 0.07376815882972376, "rouge1_precision_stderr": 0.0015413873275619697, "rouge1_recall": 0.3269107430349855, "rouge1_recall_stderr": 0.004768390550563892, "rouge2_fmeasure": 0.05370114479085305, "rouge2_fmeasure_stderr": 0.0012337413746734015, "rouge2_precision": 0.03507239155053733, "rouge2_precision_stderr": 0.000919535201162236, "rouge2_recall": 0.16374442513917464, "rouge2_recall_stderr": 0.003472789622253923, "rougeL_fmeasure": 0.10679593308879384, "rougeL_fmeasure_stderr": 0.0017931240322613966, "rougeL_precision": 0.06977608287770509, "rougeL_precision_stderr": 0.001374956676465801, "rougeL_recall": 0.31368077194090394, "rougeL_recall_stderr": 0.004624757562111944, "rougeLsum_fmeasure": 0.1058735384538628, "rougeLsum_fmeasure_stderr": 0.001807570321110346, "rougeLsum_precision": 0.06944197659188452, "rougeLsum_precision_stderr": 0.001414266115795263, "rougeLsum_recall": 0.30880609903141404, "rougeLsum_recall_stderr": 0.004454929611901481}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.0489790785966099, "bleu_stderr": 0.006779339030578523, "rouge1_fmeasure": 0.070437989859743, "rouge1_fmeasure_stderr": 0.0010312902668252013, "rouge1_precision": 0.06498870832691263, "rouge1_precision_stderr": 0.0010723261619156347, "rouge1_recall": 0.09208329086891458, "rouge1_recall_stderr": 0.0014107679511711905, "rouge2_fmeasure": 0.0032754676506289828, "rouge2_fmeasure_stderr": 0.00018463100249520333, "rouge2_precision": 0.0031259330192821917, "rouge2_precision_stderr": 0.0001792020567470567, "rouge2_recall": 0.0041562592064827136, "rouge2_recall_stderr": 0.0002654234746028053, "rougeL_fmeasure": 0.0638808755302006, "rougeL_fmeasure_stderr": 
0.0008731247333910541, "rougeL_precision": 0.058599346602679785, "rougeL_precision_stderr": 0.0008924974727175551, "rougeL_recall": 0.08428848354699885, "rougeL_recall_stderr": 0.001258671638105778, "rougeLsum_fmeasure": 0.06702516127871343, "rougeLsum_fmeasure_stderr": 0.0009693030073720331, "rougeLsum_precision": 0.06184900174439773, "rougeLsum_precision_stderr": 0.001011647200269575, "rougeLsum_recall": 0.0878197954616511, "rougeLsum_recall_stderr": 0.0013420548769352185}}, "1": {"tldr_en": {"bleu": 0.19435530388407438, "bleu_stderr": 0.027474135486464886, "rouge1_fmeasure": 0.08534345009310793, "rouge1_fmeasure_stderr": 0.0012180416276689969, "rouge1_precision": 0.07527407433133869, "rouge1_precision_stderr": 0.0012353167292243439, "rouge1_recall": 0.11879070038037605, "rouge1_recall_stderr": 0.001666703743793988, "rouge2_fmeasure": 0.004199685382427501, "rouge2_fmeasure_stderr": 0.0002477120467452383, "rouge2_precision": 0.0037334174812980337, "rouge2_precision_stderr": 0.00022288691443581962, "rouge2_recall": 0.005917125245253962, "rouge2_recall_stderr": 0.0003984864275118756, "rougeL_fmeasure": 0.0779496715247138, "rougeL_fmeasure_stderr": 0.00102035551714523, "rougeL_precision": 0.06809342794848382, "rougeL_precision_stderr": 0.0010182876153051106, "rougeL_recall": 0.11003512325420707, "rougeL_recall_stderr": 0.0014770539784070803, "rougeLsum_fmeasure": 0.08013661978467537, "rougeLsum_fmeasure_stderr": 0.0011236586288798575, "rougeLsum_precision": 0.07054606286211522, "rougeLsum_precision_stderr": 0.001139963924498013, "rougeLsum_recall": 0.11204758740190628, "rougeLsum_recall_stderr": 0.0015572733289150008}}, "2": {"tldr_en": {"bleu": 0.26166447349576716, "bleu_stderr": 0.02664531207793444, "rouge1_fmeasure": 0.09264326558315315, "rouge1_fmeasure_stderr": 0.0012341782878536826, "rouge1_precision": 0.08188916742697006, "rouge1_precision_stderr": 0.0012604740470487253, "rouge1_recall": 0.12863501946937672, "rouge1_recall_stderr": 0.0017306118753458995, "rouge2_fmeasure": 0.004799135265562465, "rouge2_fmeasure_stderr": 0.0003258383358742087, "rouge2_precision": 0.00417689519312021, "rouge2_precision_stderr": 0.00027799524088823336, "rouge2_recall": 0.0070394381668830155, "rouge2_recall_stderr": 0.000549533007866243, "rougeL_fmeasure": 0.08530978440523501, "rougeL_fmeasure_stderr": 0.0010419970840251892, "rougeL_precision": 0.07469229492018686, "rougeL_precision_stderr": 0.0010462849454463273, "rougeL_recall": 0.11996588099996168, "rougeL_recall_stderr": 0.0015431639618236344, "rougeLsum_fmeasure": 0.08652135177894667, "rougeLsum_fmeasure_stderr": 0.0011347677035999774, "rougeLsum_precision": 0.07637785471073269, "rougeLsum_precision_stderr": 0.0011637595201347114, "rougeLsum_recall": 0.12067856570386956, "rougeLsum_recall_stderr": 0.0016119906969829974}}, "3": {"tldr_en": {"bleu": 0.35676801434287403, "bleu_stderr": 0.04322442294335073, "rouge1_fmeasure": 0.08496018846964883, "rouge1_fmeasure_stderr": 0.0013534144382857925, "rouge1_precision": 0.07918559281464797, "rouge1_precision_stderr": 0.0015732102907079421, "rouge1_recall": 0.11713311231879246, "rouge1_recall_stderr": 0.0018694365480156567, "rouge2_fmeasure": 0.005617198716065193, "rouge2_fmeasure_stderr": 0.0003504250975971709, "rouge2_precision": 0.005450855204647191, "rouge2_precision_stderr": 0.00042451344437272563, "rouge2_recall": 0.00786801734624306, "rouge2_recall_stderr": 0.0005418684959080103, "rougeL_fmeasure": 0.07686453493425864, "rougeL_fmeasure_stderr": 0.0011252346058277117, "rougeL_precision": 
0.07101336043004314, "rougeL_precision_stderr": 0.001347924853763445, "rougeL_recall": 0.10774352434928229, "rougeL_recall_stderr": 0.0016488819148997246, "rougeLsum_fmeasure": 0.07875700339703082, "rougeLsum_fmeasure_stderr": 0.0012486260433925756, "rougeLsum_precision": 0.07334057105853452, "rougeLsum_precision_stderr": 0.00146993475075726, "rougeLsum_recall": 0.108988272664584, "rougeLsum_recall_stderr": 0.0017377582372509691}}, "4": {"tldr_en": {"bleu": 0.1251055397122511, "bleu_stderr": 0.014484092743724803, "rouge1_fmeasure": 0.03139283202672829, "rouge1_fmeasure_stderr": 0.0012003402420825678, "rouge1_precision": 0.030329065224279812, "rouge1_precision_stderr": 0.001330278405910414, "rouge1_recall": 0.04517297214250825, "rouge1_recall_stderr": 0.001758818657887946, "rouge2_fmeasure": 0.0029260675465866874, "rouge2_fmeasure_stderr": 0.00030134314126436455, "rouge2_precision": 0.002783763831604278, "rouge2_precision_stderr": 0.0003242150810229513, "rouge2_recall": 0.0044964737985171525, "rouge2_recall_stderr": 0.0005138289413575867, "rougeL_fmeasure": 0.02785801817001536, "rougeL_fmeasure_stderr": 0.0010086311227078628, "rougeL_precision": 0.02657200435528566, "rougeL_precision_stderr": 0.0011035850530213228, "rougeL_recall": 0.04083946585151625, "rougeL_recall_stderr": 0.001540836819231046, "rougeLsum_fmeasure": 0.028747368698632404, "rougeLsum_fmeasure_stderr": 0.0010989439749250103, "rougeLsum_precision": 0.027847356181606008, "rougeLsum_precision_stderr": 0.0012301635576539068, "rougeLsum_recall": 0.041473623387033184, "rougeLsum_recall_stderr": 0.0016278414776434958}}, "5": {"tldr_en": {"bleu": 1.608658724355054e-07, "bleu_stderr": 4.1367178740967643e-07, "rouge1_fmeasure": 0.005312217579045015, "rouge1_fmeasure_stderr": 0.0005447805558351684, "rouge1_precision": 0.00546467811919465, "rouge1_precision_stderr": 0.0006252571429574801, "rouge1_recall": 0.007588094977815991, "rouge1_recall_stderr": 0.0007854778326407263, "rouge2_fmeasure": 0.0004573993622119604, "rouge2_fmeasure_stderr": 9.93009701272525e-05, "rouge2_precision": 0.00044339189228553947, "rouge2_precision_stderr": 9.961401108641653e-05, "rouge2_recall": 0.0006655482990875535, "rouge2_recall_stderr": 0.0001814575634003083, "rougeL_fmeasure": 0.0045916498026018614, "rougeL_fmeasure_stderr": 0.0004489570697479766, "rougeL_precision": 0.00461981647245416, "rougeL_precision_stderr": 0.0005033099668854365, "rougeL_recall": 0.006788021218410852, "rougeL_recall_stderr": 0.0006881154075077389, "rougeLsum_fmeasure": 0.004852566252508574, "rougeLsum_fmeasure_stderr": 0.0004994751682757694, "rougeLsum_precision": 0.004946550238006405, "rougeLsum_precision_stderr": 0.0005649935199743343, "rougeLsum_recall": 0.007016292908843859, "rougeLsum_recall_stderr": 0.0007337348506316745}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.06113312558172764, "bleu_stderr": 0.01575172189851365, "rouge1_fmeasure": 0.018526574610266466, "rouge1_fmeasure_stderr": 0.0004491583645956695, "rouge1_precision": 0.014710592817317623, "rouge1_precision_stderr": 0.0003529315721245975, "rouge1_recall": 0.026426487712639557, "rouge1_recall_stderr": 0.0006726265968105224, "rouge2_fmeasure": 0.0006359667540750598, "rouge2_fmeasure_stderr": 0.0001041733500992725, "rouge2_precision": 0.0004867368803311285, "rouge2_precision_stderr": 7.864693694964923e-05, "rouge2_recall": 0.0009789004289938045, "rouge2_recall_stderr": 0.0001645546445630159, "rougeL_fmeasure": 0.018499794220557017, "rougeL_fmeasure_stderr": 0.00044650709340406276, 
"rougeL_precision": 0.014689240988132256, "rougeL_precision_stderr": 0.00035074403880002916, "rougeL_recall": 0.026390402516371297, "rougeL_recall_stderr": 0.0006695145672545542, "rougeLsum_fmeasure": 0.017471035147790994, "rougeLsum_fmeasure_stderr": 0.0003971392931020627, "rougeLsum_precision": 0.013873433046158122, "rougeLsum_precision_stderr": 0.0003117470202763751, "rougeLsum_recall": 0.024938658187947462, "rougeLsum_recall_stderr": 0.0005998309209761277}}, "1": {"generate_text_restaurant": {"bleu": 5.8201208498355745, "bleu_stderr": 0.07918003623517482, "rouge1_fmeasure": 0.2978972679324119, "rouge1_fmeasure_stderr": 0.001938270939643692, "rouge1_precision": 0.25691930916492756, "rouge1_precision_stderr": 0.002584112631865945, "rouge1_recall": 0.4396823419994151, "rouge1_recall_stderr": 0.002924006390564013, "rouge2_fmeasure": 0.12463514115806919, "rouge2_fmeasure_stderr": 0.001299783869728999, "rouge2_precision": 0.10986735481075849, "rouge2_precision_stderr": 0.0016737706226266926, "rouge2_recall": 0.18795573844856492, "rouge2_recall_stderr": 0.0020345189689034695, "rougeL_fmeasure": 0.24464314098146767, "rougeL_fmeasure_stderr": 0.0014611191185898375, "rougeL_precision": 0.2108434140927592, "rougeL_precision_stderr": 0.0021087197045539206, "rougeL_recall": 0.3655838393962915, "rougeL_recall_stderr": 0.0025164390500460707, "rougeLsum_fmeasure": 0.2473804112744828, "rougeLsum_fmeasure_stderr": 0.0017953159637369525, "rougeLsum_precision": 0.2157653393823411, "rougeLsum_precision_stderr": 0.0024335105491588797, "rougeLsum_recall": 0.36407468797098697, "rougeLsum_recall_stderr": 0.0026761965498774284}}, "2": {"generate_text_restaurant": {"bleu": 6.8213920736753035, "bleu_stderr": 0.09269849650267996, "rouge1_fmeasure": 0.3268772449979945, "rouge1_fmeasure_stderr": 0.002037185940562139, "rouge1_precision": 0.29394430959429885, "rouge1_precision_stderr": 0.0029349960483685197, "rouge1_recall": 0.44889829988946167, "rouge1_recall_stderr": 0.0026008346109833076, "rouge2_fmeasure": 0.1463066245452882, "rouge2_fmeasure_stderr": 0.0014390553252013872, "rouge2_precision": 0.1343978103369073, "rouge2_precision_stderr": 0.0019313519405930251, "rouge2_recall": 0.20303674980966369, "rouge2_recall_stderr": 0.0019154842806783233, "rougeL_fmeasure": 0.25892225643390154, "rougeL_fmeasure_stderr": 0.00149376807780425, "rougeL_precision": 0.2299825363481501, "rougeL_precision_stderr": 0.0021765504098611448, "rougeL_recall": 0.36308738357631903, "rougeL_recall_stderr": 0.0023177876330520717, "rougeLsum_fmeasure": 0.2738908855936707, "rougeLsum_fmeasure_stderr": 0.0019355452082937584, "rougeLsum_precision": 0.24820013238470168, "rougeLsum_precision_stderr": 0.0027165241805849048, "rougeLsum_recall": 0.37474754473838195, "rougeLsum_recall_stderr": 0.0024667758421094027}}, "3": {"generate_text_restaurant": {"bleu": 6.942009450714699, "bleu_stderr": 0.04290812234950382, "rouge1_fmeasure": 0.33114374289634996, "rouge1_fmeasure_stderr": 0.0019963064576782263, "rouge1_precision": 0.30906162515100283, "rouge1_precision_stderr": 0.0030674030901339717, "rouge1_recall": 0.44080702325897825, "rouge1_recall_stderr": 0.0026014559705604258, "rouge2_fmeasure": 0.14999298186863755, "rouge2_fmeasure_stderr": 0.0014464336948624026, "rouge2_precision": 0.14318958988373076, "rouge2_precision_stderr": 0.0020155780018855157, "rouge2_recall": 0.2013842488461, "rouge2_recall_stderr": 0.0019010681475652355, "rougeL_fmeasure": 0.26265901338393766, "rougeL_fmeasure_stderr": 0.0014797491179515284, "rougeL_precision": 
0.24226611193261477, "rougeL_precision_stderr": 0.002287690467824282, "rougeL_recall": 0.35660966290964163, "rougeL_recall_stderr": 0.002307251432723998, "rougeLsum_fmeasure": 0.27712766233037806, "rougeLsum_fmeasure_stderr": 0.0019244037043805748, "rougeLsum_precision": 0.26123456663930006, "rougeLsum_precision_stderr": 0.0028608851643456765, "rougeLsum_recall": 0.3668591988145458, "rougeLsum_recall_stderr": 0.0024536597226958376}}, "4": {"generate_text_restaurant": {"bleu": 7.50519306139533, "bleu_stderr": 0.12141535391721209, "rouge1_fmeasure": 0.35014858130045096, "rouge1_fmeasure_stderr": 0.0020798805965393467, "rouge1_precision": 0.3382900801328086, "rouge1_precision_stderr": 0.0031870177107722864, "rouge1_recall": 0.43823825667258115, "rouge1_recall_stderr": 0.0025150913221159833, "rouge2_fmeasure": 0.1583408367591379, "rouge2_fmeasure_stderr": 0.0015305939174917813, "rouge2_precision": 0.1562658891246859, "rouge2_precision_stderr": 0.002090506730559318, "rouge2_recall": 0.199427975848365, "rouge2_recall_stderr": 0.0018891086525461194, "rougeL_fmeasure": 0.26748399521207994, "rougeL_fmeasure_stderr": 0.001534553168600716, "rougeL_precision": 0.25544052575853743, "rougeL_precision_stderr": 0.0023438446152810907, "rougeL_recall": 0.34203035767325535, "rougeL_recall_stderr": 0.0022562591465260075, "rougeLsum_fmeasure": 0.2924671515205407, "rougeLsum_fmeasure_stderr": 0.001988178356101477, "rougeLsum_precision": 0.2842748733336647, "rougeLsum_precision_stderr": 0.002916261382644648, "rougeLsum_recall": 0.36471610019410694, "rougeLsum_recall_stderr": 0.002385680406071139}}, "5": {"generate_text_restaurant": {"bleu": 8.909934598498245, "bleu_stderr": 0.11355686792329017, "rouge1_fmeasure": 0.3791546852434602, "rouge1_fmeasure_stderr": 0.0020200693077103015, "rouge1_precision": 0.37588510027488736, "rouge1_precision_stderr": 0.0028779178551135293, "rouge1_recall": 0.43674914985883706, "rouge1_recall_stderr": 0.0024638439091214433, "rouge2_fmeasure": 0.1726081857656977, "rouge2_fmeasure_stderr": 0.0015716694288982242, "rouge2_precision": 0.17238666817345458, "rouge2_precision_stderr": 0.0019278848044138694, "rouge2_recall": 0.19980355257879728, "rouge2_recall_stderr": 0.0018510324254879687, "rougeL_fmeasure": 0.28073901470903295, "rougeL_fmeasure_stderr": 0.001614330726795435, "rougeL_precision": 0.2762695240145616, "rougeL_precision_stderr": 0.0021890512735288354, "rougeL_recall": 0.3283605962966383, "rougeL_recall_stderr": 0.002209924055940966, "rougeLsum_fmeasure": 0.32085059046180675, "rougeLsum_fmeasure_stderr": 0.0019675465538199436, "rougeLsum_precision": 0.3186956691262859, "rougeLsum_precision_stderr": 0.0026612676205351017, "rougeLsum_recall": 0.36923270586040546, "rougeLsum_recall_stderr": 0.0023640716126389163}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.06212804417145812, "bleu_stderr": 0.011971318709799528, "rouge1_fmeasure": 0.10102973537775274, "rouge1_fmeasure_stderr": 0.0014355102738807643, "rouge1_precision": 0.07192798210581976, "rouge1_precision_stderr": 0.0010917008658073372, "rouge1_recall": 0.17738230504867103, "rouge1_recall_stderr": 0.0023612645642943995, "rouge2_fmeasure": 0.006073255625355645, "rouge2_fmeasure_stderr": 0.0003809878915466881, "rouge2_precision": 0.004288158761745872, "rouge2_precision_stderr": 0.0002700287405913071, "rouge2_recall": 0.010847911832547128, "rouge2_recall_stderr": 0.0006940494564969847, "rougeL_fmeasure": 0.08708997150339674, "rougeL_fmeasure_stderr": 0.0011768137916529835, "rougeL_precision": 0.061860157155504046, 
"rougeL_precision_stderr": 0.0008875877382502058, "rougeL_recall": 0.15379181870207026, "rougeL_recall_stderr": 0.0019969008216420697, "rougeLsum_fmeasure": 0.08656040328674863, "rougeLsum_fmeasure_stderr": 0.001216757422923546, "rougeLsum_precision": 0.061491345289910605, "rougeLsum_precision_stderr": 0.0009161557912449834, "rougeLsum_recall": 0.15291182401919987, "rougeLsum_recall_stderr": 0.0020815555284572295}}, "1": {"article_DOC_summary": {"bleu": 0.07413849235928043, "bleu_stderr": 0.01976351303187912, "rouge1_fmeasure": 0.09194088936961643, "rouge1_fmeasure_stderr": 0.0013783104926480619, "rouge1_precision": 0.06505394396909397, "rouge1_precision_stderr": 0.0010345000379844767, "rouge1_recall": 0.16367528098007583, "rouge1_recall_stderr": 0.002310164884000985, "rouge2_fmeasure": 0.005968104627521815, "rouge2_fmeasure_stderr": 0.0003824456236105915, "rouge2_precision": 0.004211301109017673, "rouge2_precision_stderr": 0.0002712127686911097, "rouge2_recall": 0.010640558564493393, "rouge2_recall_stderr": 0.0006925012884316564, "rougeL_fmeasure": 0.08242443142898405, "rougeL_fmeasure_stderr": 0.0011823524939069798, "rougeL_precision": 0.05824585599255965, "rougeL_precision_stderr": 0.0008851968023125635, "rougeL_recall": 0.14716783464641015, "rougeL_recall_stderr": 0.0020015127545039207, "rougeLsum_fmeasure": 0.07935310191197463, "rougeLsum_fmeasure_stderr": 0.0011428193509701132, "rougeLsum_precision": 0.05601648748687626, "rougeLsum_precision_stderr": 0.0008514365416564917, "rougeLsum_recall": 0.14220135901573897, "rougeLsum_recall_stderr": 0.001984633594195078}}, "2": {"article_DOC_summary": {"bleu": 0.10304318645626095, "bleu_stderr": 0.02590406105837305, "rouge1_fmeasure": 0.08444066513722136, "rouge1_fmeasure_stderr": 0.0014657483966925, "rouge1_precision": 0.05970144726407935, "rouge1_precision_stderr": 0.0010808131881698626, "rouge1_recall": 0.15032918128991743, "rouge1_recall_stderr": 0.0025278263617417337, "rouge2_fmeasure": 0.00672052705909708, "rouge2_fmeasure_stderr": 0.0004405404886083871, "rouge2_precision": 0.004734886121730911, "rouge2_precision_stderr": 0.00030986275466350464, "rouge2_recall": 0.012150770146827796, "rouge2_recall_stderr": 0.0008496354899205403, "rougeL_fmeasure": 0.07722118221070728, "rougeL_fmeasure_stderr": 0.0012763559941414496, "rougeL_precision": 0.0545565395708172, "rougeL_precision_stderr": 0.0009399168505439712, "rougeL_recall": 0.13773639312565078, "rougeL_recall_stderr": 0.0022148411509370623, "rougeLsum_fmeasure": 0.07391840483447609, "rougeLsum_fmeasure_stderr": 0.0012190936937503734, "rougeLsum_precision": 0.05216833775619172, "rougeLsum_precision_stderr": 0.0008929985372574541, "rougeLsum_recall": 0.1322606270465344, "rougeLsum_recall_stderr": 0.002157181329873677}}, "3": {"article_DOC_summary": {"bleu": 0.22427754093829802, "bleu_stderr": 0.034552127103896886, "rouge1_fmeasure": 0.07896539889705549, "rouge1_fmeasure_stderr": 0.0016540196570312491, "rouge1_precision": 0.05815946761165747, "rouge1_precision_stderr": 0.001380812200134281, "rouge1_recall": 0.13549250184452813, "rouge1_recall_stderr": 0.0027208611626966957, "rouge2_fmeasure": 0.006943244163002554, "rouge2_fmeasure_stderr": 0.0005154856926939987, "rouge2_precision": 0.0051386914240500875, "rouge2_precision_stderr": 0.00040605626704506864, "rouge2_recall": 0.01205042518636505, "rouge2_recall_stderr": 0.0008745757698404825, "rougeL_fmeasure": 0.07314249899760741, "rougeL_fmeasure_stderr": 0.0014324655345679602, "rougeL_precision": 0.05358406774115243, "rougeL_precision_stderr": 
0.0011716163642417996, "rougeL_recall": 0.1260775001189485, "rougeL_recall_stderr": 0.0023957295090012844, "rougeLsum_fmeasure": 0.06851629797589617, "rougeLsum_fmeasure_stderr": 0.0013679563501104437, "rougeLsum_precision": 0.050377295082503465, "rougeLsum_precision_stderr": 0.0011611779272879762, "rougeLsum_recall": 0.11809508568654702, "rougeLsum_recall_stderr": 0.0022806675190111193}}, "4": {"article_DOC_summary": {"bleu": 0.20252486202824954, "bleu_stderr": 0.0451733186712267, "rouge1_fmeasure": 0.024599325309196316, "rouge1_fmeasure_stderr": 0.0015226073238554963, "rouge1_precision": 0.02214933884629678, "rouge1_precision_stderr": 0.0016410873850719364, "rouge1_recall": 0.03772298255486349, "rouge1_recall_stderr": 0.0023501327317525427, "rouge2_fmeasure": 0.0026139422307253986, "rouge2_fmeasure_stderr": 0.00037040229047969556, "rouge2_precision": 0.0020692193883074857, "rouge2_precision_stderr": 0.0003062075285410719, "rouge2_recall": 0.004199075477211875, "rouge2_recall_stderr": 0.0006094344180167119, "rougeL_fmeasure": 0.022646617016988464, "rougeL_fmeasure_stderr": 0.0013512159194685597, "rougeL_precision": 0.020330750956918043, "rougeL_precision_stderr": 0.001451030088641699, "rougeL_recall": 0.03484763177326402, "rougeL_recall_stderr": 0.0020977393818014333, "rougeLsum_fmeasure": 0.020880117459677255, "rougeLsum_fmeasure_stderr": 0.0012684304431225134, "rougeLsum_precision": 0.01900606349570929, "rougeLsum_precision_stderr": 0.0014097333664542056, "rougeLsum_recall": 0.031891578587321945, "rougeLsum_recall_stderr": 0.0019321769969979038}}, "5": {"article_DOC_summary": {"bleu": 1.6165965257139506e-38, "bleu_stderr": 3.022588684192216e-34, "rouge1_fmeasure": 0.0018102133465432942, "rouge1_fmeasure_stderr": 0.0004995256780505414, "rouge1_precision": 0.0019620307874586742, "rouge1_precision_stderr": 0.0005451730856114239, "rouge1_recall": 0.0017379629055225318, "rouge1_recall_stderr": 0.0004813113588013356, "rouge2_fmeasure": 0.00010730891049598514, "rouge2_fmeasure_stderr": 7.588337415201644e-05, "rouge2_precision": 0.00010762452493862039, "rouge2_precision_stderr": 7.62180455261845e-05, "rouge2_recall": 0.0001072041166380789, "rouge2_recall_stderr": 7.577221663047183e-05, "rougeL_fmeasure": 0.0016046615232249279, "rougeL_fmeasure_stderr": 0.0004323354423624825, "rougeL_precision": 0.0017313225130562591, "rougeL_precision_stderr": 0.0004677255640486146, "rougeL_recall": 0.0015468066285396214, "rougeL_recall_stderr": 0.0004190027142908825, "rougeLsum_fmeasure": 0.0016046615232249279, "rougeLsum_fmeasure_stderr": 0.0004323354423624825, "rougeLsum_precision": 0.0017313225130562591, "rougeLsum_precision_stderr": 0.0004677255640486146, "rougeLsum_recall": 0.0015468066285396214, "rougeLsum_recall_stderr": 0.0004190027142908825}}}} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6054590d734019ff112020a13bc76a61f429d6c5 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4255419689866024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04553671379431864 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": 
"PALM_prompt", + "rouge1_precision": 0.07691162970808266, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015499735187471276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3361349423128226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004717900225751208 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11709985442438758, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019235811070200824 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03634094203430491, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010568736203451017 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16382279258438331, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032641856127605708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05502825027290272, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012188176771805944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07347783656701605, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014400832686900517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.325262737431151, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046026209075170806 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11222201441152387, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017898759559440783 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07247687306153981, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014666193171867594 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31667827309434965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004394255465374989 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1102576752933881, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017934408668905857 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.json 
new file mode 100644 index 0000000000000000000000000000000000000000..e4ba59396c5be752b1e544ac963eec3a2a75d074 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4375148581871164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.043362378909165875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.084573986124589, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022156458659289445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32974434511651185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004760360381005011 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12097522483010485, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002202684090770471 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.040740790020171655, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0014239356080917533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16249675831000945, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003343995882368083 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05811058513302895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0014298876541859705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07979616615847458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020050186616118407 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31708796648649956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004617552099836135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11484527988661357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0020020380133125328 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07967262232949184, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002074014298894367 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3110596157987853, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004435869800526469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11394268478061556, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020477402089164776 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..332fcb62d0da40c8e740b270be536709dac4a050 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.43895421656508316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03560314454345476 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07893892499245164, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017559781579577936 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33147399394577687, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004792853072557146 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11837525964996032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020736790107389433 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03778130469895934, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011220289463223655 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16607823925825196, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003428415308098998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05685211598675336, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013242460972892624 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0751384880267446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016250699836019358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31955459951422993, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004633747457550051 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11305526526705706, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019076101565052952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07484212840598793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016673128268097014 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31478623106918857, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004462671481456754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11212261864528106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019332204600801869 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ae48b6d84197cb74ea1584e5b09b6adc8f278dcf --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4055173976621406, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03388942315369854 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07564013742377862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001624130926371493 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32832027029775396, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004754946563417434 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11521270106633405, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020223822680663353 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03600203329160531, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010611943954759993 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16248350820338053, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003392520543251839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.054818451753863656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012891162497869618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07201677299268112, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015006281609086204 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31619306971917027, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + 
"rougeL_recall_stderr": 0.004595788592278942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11001628031350326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018702505486722842 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07139551984673621, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001521052983920996 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3106722185590567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004410951996082152 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10876033036378392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00187477057348602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..d48021708fc3c742d05a89d4eb9dc7fbfeec8f85 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4219802204390914, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.035062189637818944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07604999251191036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001694487867306116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32739311904589957, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004733990565227727 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11435782016646073, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019930832962392476 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03608787195276645, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009682458617317419 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16298585196913531, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033617720441310407 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05493385212823661, + "dataset_path": 
"GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012747653539780157 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0719402479836372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015114912843032233 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3140568060068406, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004584266596245509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1087646065353738, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018365048381652636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0717813619994629, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015531796251007297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30959297417269843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004389147529346589 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10814461597064734, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018690945883989335 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e82cb1282e053744603fd0d216a842655e428ee3 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.41421236223783364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03144881485668538 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07376815882972376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015413873275619697 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3269107430349855, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004768390550563892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11235776546508958, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019443755895376892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + 
"rouge2_precision": 0.03507239155053733, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000919535201162236 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16374442513917464, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003472789622253923 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05370114479085305, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012337413746734015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06977608287770509, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001374956676465801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31368077194090394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004624757562111944 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10679593308879384, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017931240322613966 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06944197659188452, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001414266115795263 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30880609903141404, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004454929611901481 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1058735384538628, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001807570321110346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..82ac727acf4c6ea4d352c839e05499e44a80f74e --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06498870832691263, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010723261619156347 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.09208329086891458, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 
0.0014107679511711905 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.070437989859743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0010312902668252013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0031259330192821917, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0001792020567470567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0041562592064827136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0002654234746028053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0032754676506289828, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00018463100249520333 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.058599346602679785, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008924974727175551 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.08428848354699885, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001258671638105778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0638808755302006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008731247333910541 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.06184900174439773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001011647200269575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.0878197954616511, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0013420548769352185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.06702516127871343, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009693030073720331 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.0489790785966099, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.006779339030578523 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..79cea2df4fdb4123ae37d4187db84da3f717f647 --- /dev/null +++ 
b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07527407433133869, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012353167292243439 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.11879070038037605, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001666703743793988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.08534345009310793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012180416276689969 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0037334174812980337, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00022288691443581962 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.005917125245253962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003984864275118756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004199685382427501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002477120467452383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.06809342794848382, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010182876153051106 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.11003512325420707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0014770539784070803 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0779496715247138, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00102035551714523 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07054606286211522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001139963924498013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.11204758740190628, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0015572733289150008 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.08013661978467537, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011236586288798575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.19435530388407438, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.027474135486464886 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..62984599af4f0bc3c2b763a7b7ac207967b465dc --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.08188916742697006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012604740470487253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.12863501946937672, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0017306118753458995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.09264326558315315, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012341782878536826 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.00417689519312021, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00027799524088823336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0070394381668830155, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.000549533007866243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.004799135265562465, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003258383358742087 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07469229492018686, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010462849454463273 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.11996588099996168, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0015431639618236344 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.08530978440523501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010419970840251892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07637785471073269, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011637595201347114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.12067856570386956, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": 
null, + "rougeLsum_recall_stderr": 0.0016119906969829974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.08652135177894667, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011347677035999774 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.26166447349576716, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02664531207793444 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..12e371ffee81ab4b346097574dbaff3538bd6b8d --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07918559281464797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015732102907079421 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.11713311231879246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0018694365480156567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.08496018846964883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0013534144382857925 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.005450855204647191, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00042451344437272563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.00786801734624306, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005418684959080103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.005617198716065193, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003504250975971709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.07101336043004314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001347924853763445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.10774352434928229, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016488819148997246 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 
0.07686453493425864, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0011252346058277117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.07334057105853452, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00146993475075726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.108988272664584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0017377582372509691 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.07875700339703082, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012486260433925756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.35676801434287403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04322442294335073 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..07155d1c16218534439bcc840325e0245639f10c --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.030329065224279812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001330278405910414 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.04517297214250825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001758818657887946 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.03139283202672829, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0012003402420825678 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002783763831604278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003242150810229513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0044964737985171525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005138289413575867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0029260675465866874, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 
0.00030134314126436455 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.02657200435528566, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011035850530213228 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.04083946585151625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001540836819231046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.02785801817001536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0010086311227078628 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.027847356181606008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012301635576539068 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.041473623387033184, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0016278414776434958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.028747368698632404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0010989439749250103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.1251055397122511, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.014484092743724803 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a724adc221e260f34f3d5d664657e4b2ea4c256b --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.00546467811919465, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0006252571429574801 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.007588094977815991, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0007854778326407263 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.005312217579045015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0005447805558351684 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 
0.00044339189228553947, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 9.961401108641653e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0006655482990875535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0001814575634003083 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0004573993622119604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 9.93009701272525e-05 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.00461981647245416, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0005033099668854365 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.006788021218410852, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0006881154075077389 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0045916498026018614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0004489570697479766 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.004946550238006405, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0005649935199743343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.007016292908843859, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0007337348506316745 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.004852566252508574, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0004994751682757694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.608658724355054e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 4.1367178740967643e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..fb81bbaaab7a811a2b4d9d82810603063a954b30 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.06113312558172764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, 
+ "subset": null, + "bleu_stderr": 0.01575172189851365 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.014710592817317623, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0003529315721245975 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.026426487712639557, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0006726265968105224 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.018526574610266466, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0004491583645956695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.0004867368803311285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 7.864693694964923e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.0009789004289938045, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0001645546445630159 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0006359667540750598, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0001041733500992725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.014689240988132256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00035074403880002916 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.026390402516371297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0006695145672545542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.018499794220557017, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00044650709340406276 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.013873433046158122, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0003117470202763751 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.024938658187947462, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0005998309209761277 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.017471035147790994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0003971392931020627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + 
"batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..acdce7e49616e6fb19c22d763b6d2beeb8d5977d --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.8201208498355745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07918003623517482 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.25691930916492756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002584112631865945 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4396823419994151, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002924006390564013 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.2978972679324119, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.001938270939643692 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10986735481075849, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016737706226266926 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18795573844856492, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020345189689034695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12463514115806919, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001299783869728999 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2108434140927592, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0021087197045539206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3655838393962915, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025164390500460707 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.24464314098146767, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014611191185898375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.2157653393823411, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0024335105491588797 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": 
"generate_text_restaurant", + "rougeLsum_recall": 0.36407468797098697, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026761965498774284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2473804112744828, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017953159637369525 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c750d3f5b216886d417b3d8fd529c360b031f230 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.8213920736753035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09269849650267996 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.29394430959429885, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029349960483685197 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44889829988946167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026008346109833076 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3268772449979945, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002037185940562139 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1343978103369073, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019313519405930251 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.20303674980966369, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019154842806783233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1463066245452882, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014390553252013872 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2299825363481501, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0021765504098611448 + }, + { + "task_name": 
"e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.36308738357631903, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023177876330520717 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.25892225643390154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00149376807780425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.24820013238470168, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0027165241805849048 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.37474754473838195, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024667758421094027 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2738908855936707, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019355452082937584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..780cba7792d663e21cd39bd5850a1b484222485b --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.942009450714699, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.04290812234950382 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.30906162515100283, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0030674030901339717 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44080702325897825, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026014559705604258 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.33114374289634996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019963064576782263 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.14318958988373076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 
0.0020155780018855157 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2013842488461, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019010681475652355 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.14999298186863755, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014464336948624026 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.24226611193261477, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002287690467824282 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35660966290964163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002307251432723998 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.26265901338393766, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014797491179515284 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.26123456663930006, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0028608851643456765 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3668591988145458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0024536597226958376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.27712766233037806, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019244037043805748 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4b7526713436261dfacc56090aa9911d519e519c --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.50519306139533, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12141535391721209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3382900801328086, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + 
"rouge1_precision_stderr": 0.0031870177107722864 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43823825667258115, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0025150913221159833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.35014858130045096, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020798805965393467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1562658891246859, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002090506730559318 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.199427975848365, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018891086525461194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1583408367591379, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015305939174917813 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.25544052575853743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0023438446152810907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.34203035767325535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022562591465260075 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.26748399521207994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001534553168600716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.2842748733336647, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002916261382644648 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36471610019410694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002385680406071139 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2924671515205407, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001988178356101477 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.json 
b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5118f10cea15e22cea6642725139d3190a229773 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 8.909934598498245, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11355686792329017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.37588510027488736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0028779178551135293 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.43674914985883706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024638439091214433 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3791546852434602, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020200693077103015 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.17238666817345458, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019278848044138694 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19980355257879728, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018510324254879687 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1726081857656977, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015716694288982242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2762695240145616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0021890512735288354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3283605962966383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002209924055940966 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28073901470903295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001614330726795435 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3186956691262859, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0026612676205351017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36923270586040546, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023640716126389163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + 
"rougeLsum_fmeasure": 0.32085059046180675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019675465538199436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..81a4bb9c9e6f74fbc0486317cb9769e5dbeb1e8f --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.07192798210581976, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0010917008658073372 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.17738230504867103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0023612645642943995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.10102973537775274, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0014355102738807643 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.004288158761745872, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002700287405913071 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.010847911832547128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0006940494564969847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.006073255625355645, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0003809878915466881 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.061860157155504046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0008875877382502058 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.15379181870207026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0019969008216420697 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.08708997150339674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0011768137916529835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.061491345289910605, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0009161557912449834 + }, + { + 
"task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.15291182401919987, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0020815555284572295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.08656040328674863, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.001216757422923546 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.06212804417145812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.011971318709799528 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..059d57a073cfcf4ad3628b0de9754b777e8a5685 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06505394396909397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0010345000379844767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.16367528098007583, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.002310164884000985 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.09194088936961643, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0013783104926480619 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.004211301109017673, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002712127686911097 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.010640558564493393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0006925012884316564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.005968104627521815, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0003824456236105915 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.05824585599255965, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0008851968023125635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.14716783464641015, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 
0.0020015127545039207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.08242443142898405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0011823524939069798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05601648748687626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008514365416564917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.14220135901573897, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.001984633594195078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.07935310191197463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0011428193509701132 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.07413849235928043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.01976351303187912 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..52b142cd87ab11532cd67d5f3b40ccb6911afeae --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.05970144726407935, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0010808131881698626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.15032918128991743, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0025278263617417337 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.08444066513722136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0014657483966925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.004734886121730911, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00030986275466350464 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.012150770146827796, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0008496354899205403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00672052705909708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": 
"", + "rouge2_fmeasure_stderr": 0.0004405404886083871 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0545565395708172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0009399168505439712 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.13773639312565078, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0022148411509370623 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.07722118221070728, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0012763559941414496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.05216833775619172, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008929985372574541 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.1322606270465344, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.002157181329873677 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.07391840483447609, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0012190936937503734 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.10304318645626095, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.02590406105837305 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d9feada194e612fe6f920b56e0f02ccd3bac294d --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.05815946761165747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001380812200134281 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.13549250184452813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0027208611626966957 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.07896539889705549, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0016540196570312491 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0051386914240500875, + "dataset_path": "GEM/xsum", + 
"dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00040605626704506864 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01205042518636505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0008745757698404825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.006943244163002554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0005154856926939987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.05358406774115243, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0011716163642417996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.1260775001189485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0023957295090012844 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.07314249899760741, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0014324655345679602 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.050377295082503465, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0011611779272879762 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.11809508568654702, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0022806675190111193 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.06851629797589617, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0013679563501104437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.22427754093829802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.034552127103896886 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..12e9b0b1480d29b701bf55f5745db7d60025d7b2 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.02214933884629678, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0016410873850719364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.03772298255486349, 
+ "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0023501327317525427 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.024599325309196316, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0015226073238554963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0020692193883074857, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0003062075285410719 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.004199075477211875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0006094344180167119 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0026139422307253986, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00037040229047969556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.020330750956918043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001451030088641699 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.03484763177326402, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0020977393818014333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.022646617016988464, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0013512159194685597 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.01900606349570929, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014097333664542056 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.031891578587321945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0019321769969979038 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.020880117459677255, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0012684304431225134 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.20252486202824954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0451733186712267 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 
0000000000000000000000000000000000000000..d4898e763b3d7dbe777e5958896f3907e9e03e88 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/generation/slim.4b284b12bc4seed4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0019620307874586742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0005451730856114239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0017379629055225318, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0004813113588013356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0018102133465432942, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0004995256780505414 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00010762452493862039, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 7.62180455261845e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0001072041166380789, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 7.577221663047183e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00010730891049598514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 7.588337415201644e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0017313225130562591, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0004677255640486146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0015468066285396214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004190027142908825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0016046615232249279, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004323354423624825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0017313225130562591, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0004677255640486146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0015468066285396214, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004190027142908825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0016046615232249279, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0004323354423624825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.6165965257139506e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 3.022588684192216e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": 
"pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b12bc4seed4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_0.csv b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..1fdca79dcc42f6d3f1ddc40bbd9fe86a56faf18d --- /dev/null +++ b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121731,0 +anli_r2,acc,0.337,0.014955087918653603,0 +anli_r3,acc,0.35583333333333333,0.013826518748493307,0 +arc_challenge,acc,0.2687713310580205,0.012955065963710688,0 +arc_challenge,acc_norm,0.295221843003413,0.013329750293382318,0 +arc_easy,acc,0.5576599326599326,0.010191334444220856,0 +arc_easy,acc_norm,0.5046296296296297,0.01025934370588972,0 +boolq,acc,0.5990825688073395,0.008571628711617,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.28917120387174833,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.46813383788090024,0.004979637330230307,0 +hellaswag,acc_norm,0.6135232025492929,0.004859467984155278,0 +piqa,acc,0.7437431991294886,0.010185787831565062,0 +piqa,acc_norm,0.750272034820457,0.010099232969867472,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.829,0.011912216456264613,0 +sciq,acc_norm,0.758,0.013550631705555961,0 +storycloze_2016,acc,0.694815606627472,0.010648664383985656,0 +winogrande,acc,0.5698500394632992,0.0139146850947167,0 diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_0_lm-eval_global_step80108_2023-02-24-21-34-52_0shots_backup.json b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_0_lm-eval_global_step80108_2023-02-24-21-34-52_0shots_backup.json deleted file mode 100644 index 79e2c5227b2e5302a1bbf7bf7867ced9c75cf654..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_0_lm-eval_global_step80108_2023-02-24-21-34-52_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.342, - "acc_stderr": 0.015008706182121731 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.014955087918653603 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.013826518748493307 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.28917120387174833 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.46813383788090024, - "acc_stderr": 0.004979637330230307, - "acc_norm": 0.6135232025492929, - "acc_norm_stderr": 0.004859467984155278 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5698500394632992, - "acc_stderr": 0.0139146850947167 - }, - "storycloze_2016": { - "acc": 0.694815606627472, - "acc_stderr": 0.010648664383985656 - }, - "boolq": { - "acc": 0.5990825688073395, - "acc_stderr": 0.008571628711617 - }, - "arc_easy": { - "acc": 0.5576599326599326, - "acc_stderr": 0.010191334444220856, - "acc_norm": 0.5046296296296297, - "acc_norm_stderr": 0.01025934370588972 - }, - "arc_challenge": { - "acc": 0.2687713310580205, - "acc_stderr": 
0.012955065963710688, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.013329750293382318 - }, - "sciq": { - "acc": 0.829, - "acc_stderr": 0.011912216456264613, - "acc_norm": 0.758, - "acc_norm_stderr": 0.013550631705555961 - }, - "piqa": { - "acc": 0.7437431991294886, - "acc_stderr": 0.010185787831565062, - "acc_norm": 0.750272034820457, - "acc_norm_stderr": 0.010099232969867472 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_1.csv b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..8b186d9d7e5d152225cda76a86a389eda086207c --- /dev/null +++ b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.015050266127564446,0 +anli_r2,acc,0.354,0.015129868238451772,0 +anli_r3,acc,0.3475,0.013751753243291852,0 +arc_challenge,acc,0.2773037542662116,0.013082095839059374,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053059,0 +arc_easy,acc,0.5824915824915825,0.010119187377776033,0 +arc_easy,acc_norm,0.5340909090909091,0.010235908103438688,0 +boolq,acc,0.609480122324159,0.008532845556631464,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.2957839262187088,,1 +copa,acc,0.78,0.04163331998932262,0 +hellaswag,acc,0.4673371838279227,0.004979123236507981,0 +hellaswag,acc_norm,0.6169089822744473,0.004851466623601435,0 +piqa,acc,0.7529923830250272,0.010062268140772627,0 +piqa,acc_norm,0.7676822633297062,0.009853201384168241,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.84,0.011598902298689005,0 +sciq,acc_norm,0.777,0.013169830843425665,0 +storycloze_2016,acc,0.7028327097808659,0.01056831334579161,0 +winogrande,acc,0.5785319652722968,0.013878072377497603,0 diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_1_lm-eval_global_step80108_2023-02-24-21-34-52_1shots_backup.json b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_1_lm-eval_global_step80108_2023-02-24-21-34-52_1shots_backup.json deleted file mode 100644 index 68bed1a5e3fc77a3d8b62e15c6d67b55bb99de05..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_1_lm-eval_global_step80108_2023-02-24-21-34-52_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.015050266127564446 - }, - "anli_r2": { - "acc": 0.354, - "acc_stderr": 0.015129868238451772 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291852 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.2957839262187088 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.4673371838279227, - "acc_stderr": 0.004979123236507981, - "acc_norm": 0.6169089822744473, - "acc_norm_stderr": 0.004851466623601435 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5785319652722968, - "acc_stderr": 0.013878072377497603 - }, - "storycloze_2016": { - "acc": 0.7028327097808659, - "acc_stderr": 0.01056831334579161 - }, - "boolq": { - "acc": 0.609480122324159, - "acc_stderr": 0.008532845556631464 - }, - "arc_easy": { - "acc": 
0.5824915824915825, - "acc_stderr": 0.010119187377776033, - "acc_norm": 0.5340909090909091, - "acc_norm_stderr": 0.010235908103438688 - }, - "arc_challenge": { - "acc": 0.2773037542662116, - "acc_stderr": 0.013082095839059374, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053059 - }, - "sciq": { - "acc": 0.84, - "acc_stderr": 0.011598902298689005, - "acc_norm": 0.777, - "acc_norm_stderr": 0.013169830843425665 - }, - "piqa": { - "acc": 0.7529923830250272, - "acc_stderr": 0.010062268140772627, - "acc_norm": 0.7676822633297062, - "acc_norm_stderr": 0.009853201384168241 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_2.csv b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..5ba4c2ba441ef1dc308c67ff3b7a8d8b0c4f0b19 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.353,0.01512017260548369,0 +anli_r2,acc,0.34,0.014987482264363937,0 +anli_r3,acc,0.3675,0.01392352968535928,0 +arc_challenge,acc,0.27559726962457337,0.01305716965576184,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053057,0 +arc_easy,acc,0.5888047138047138,0.010096663811817683,0 +arc_easy,acc_norm,0.5521885521885522,0.010203742451111527,0 +boolq,acc,0.6009174311926605,0.008565077958836785,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.2736754276178557,,1 +copa,acc,0.8,0.04020151261036844,0 +hellaswag,acc,0.4690300736904999,0.004980200451851678,0 +hellaswag,acc_norm,0.6172077275443139,0.004850748687859918,0 +piqa,acc,0.7513601741022851,0.010084511234296857,0 +piqa,acc_norm,0.7611534276387377,0.009948120385337484,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.847,0.011389500459665533,0 +sciq,acc_norm,0.793,0.012818553557843986,0 +storycloze_2016,acc,0.7028327097808659,0.01056831334579161,0 +winogrande,acc,0.5761641673243884,0.013888492389944516,0 diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_2_lm-eval_global_step80108_2023-02-24-21-34-52_2shots_backup.json b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_2_lm-eval_global_step80108_2023-02-24-21-34-52_2shots_backup.json deleted file mode 100644 index e2200004eeb6ce4ee945fbfc2edfa43cabef6776..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_2_lm-eval_global_step80108_2023-02-24-21-34-52_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.353, - "acc_stderr": 0.01512017260548369 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r3": { - "acc": 0.3675, - "acc_stderr": 0.01392352968535928 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.2736754276178557 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036844 - }, - "hellaswag": { - "acc": 0.4690300736904999, - "acc_stderr": 0.004980200451851678, - "acc_norm": 0.6172077275443139, - "acc_norm_stderr": 0.004850748687859918 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5761641673243884, - "acc_stderr": 0.013888492389944516 - }, - 
"storycloze_2016": { - "acc": 0.7028327097808659, - "acc_stderr": 0.01056831334579161 - }, - "boolq": { - "acc": 0.6009174311926605, - "acc_stderr": 0.008565077958836785 - }, - "arc_easy": { - "acc": 0.5888047138047138, - "acc_stderr": 0.010096663811817683, - "acc_norm": 0.5521885521885522, - "acc_norm_stderr": 0.010203742451111527 - }, - "arc_challenge": { - "acc": 0.27559726962457337, - "acc_stderr": 0.01305716965576184, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053057 - }, - "sciq": { - "acc": 0.847, - "acc_stderr": 0.011389500459665533, - "acc_norm": 0.793, - "acc_norm_stderr": 0.012818553557843986 - }, - "piqa": { - "acc": 0.7513601741022851, - "acc_stderr": 0.010084511234296857, - "acc_norm": 0.7611534276387377, - "acc_norm_stderr": 0.009948120385337484 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_3.csv b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..2e4ed4793661eb4cb5d635b68a42fa50461e8932 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.344,0.015029633724408948,0 +anli_r2,acc,0.354,0.015129868238451773,0 +anli_r3,acc,0.3475,0.013751753243291854,0 +arc_challenge,acc,0.2764505119453925,0.013069662474252425,0 +arc_challenge,acc_norm,0.3046075085324232,0.013449522109932492,0 +arc_easy,acc,0.5896464646464646,0.010093531255765465,0 +arc_easy,acc_norm,0.5484006734006734,0.010211600726405236,0 +boolq,acc,0.6048929663608563,0.008550454248280891,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.3333858888450927,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.46594303923521213,0.00497819289340629,0 +hellaswag,acc_norm,0.6192989444333798,0.004845668799108535,0 +piqa,acc,0.7442872687704026,0.01017869010945986,0 +piqa,acc_norm,0.7557127312295974,0.010024765172284253,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.856,0.01110798754893915,0 +sciq,acc_norm,0.798,0.012702651587655137,0 +storycloze_2016,acc,0.7033671833244255,0.010562819181563219,0 +winogrande,acc,0.5706393054459353,0.01391153749996916,0 diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_3_lm-eval_global_step80108_2023-02-24-21-34-52_3shots_backup.json b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_3_lm-eval_global_step80108_2023-02-24-21-34-52_3shots_backup.json deleted file mode 100644 index cf774fafd67e7c100ea47daf776bd295636a8d69..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_3_lm-eval_global_step80108_2023-02-24-21-34-52_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408948 - }, - "anli_r2": { - "acc": 0.354, - "acc_stderr": 0.015129868238451773 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291854 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.3333858888450927 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.46594303923521213, - "acc_stderr": 0.00497819289340629, - "acc_norm": 0.6192989444333798, - "acc_norm_stderr": 
0.004845668799108535 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366426 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.01391153749996916 - }, - "storycloze_2016": { - "acc": 0.7033671833244255, - "acc_stderr": 0.010562819181563219 - }, - "boolq": { - "acc": 0.6048929663608563, - "acc_stderr": 0.008550454248280891 - }, - "arc_easy": { - "acc": 0.5896464646464646, - "acc_stderr": 0.010093531255765465, - "acc_norm": 0.5484006734006734, - "acc_norm_stderr": 0.010211600726405236 - }, - "arc_challenge": { - "acc": 0.2764505119453925, - "acc_stderr": 0.013069662474252425, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.013449522109932492 - }, - "sciq": { - "acc": 0.856, - "acc_stderr": 0.01110798754893915, - "acc_norm": 0.798, - "acc_norm_stderr": 0.012702651587655137 - }, - "piqa": { - "acc": 0.7442872687704026, - "acc_stderr": 0.01017869010945986, - "acc_norm": 0.7557127312295974, - "acc_norm_stderr": 0.010024765172284253 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_4.csv b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..92d234fb68a282d9d94e276ec5cfdef07308c0ef --- /dev/null +++ b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.349,0.015080663991563098,0 +anli_r2,acc,0.353,0.015120172605483687,0 +anli_r3,acc,0.3408333333333333,0.013688600793296936,0 +arc_challenge,acc,0.27047781569965873,0.012980954547659554,0 +arc_challenge,acc_norm,0.30631399317406144,0.013470584417276511,0 +arc_easy,acc,0.5816498316498316,0.01012206147074285,0 +arc_easy,acc_norm,0.5526094276094277,0.01020283238541565,0 +boolq,acc,0.6085626911314985,0.00853643052440395,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.26129032258064516,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4671380203146783,0.004978992721242829,0 +hellaswag,acc_norm,0.6201951802429795,0.004843462545943485,0 +piqa,acc,0.7529923830250272,0.010062268140772629,0 +piqa,acc_norm,0.7622415669205659,0.009932525779525494,0 +rte,acc,0.5270758122743683,0.0300523034631437,0 +sciq,acc,0.851,0.011266140684632178,0 +sciq,acc_norm,0.802,0.012607733934175313,0 +storycloze_2016,acc,0.706574024585783,0.01052948933474447,0 +winogrande,acc,0.574585635359116,0.013895257666646378,0 diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_4_lm-eval_global_step80108_2023-02-24-21-34-52_4shots_backup.json b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_4_lm-eval_global_step80108_2023-02-24-21-34-52_4shots_backup.json deleted file mode 100644 index d7d81bf46230fbf4014382b33813dd71d90c5048..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_4_lm-eval_global_step80108_2023-02-24-21-34-52_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.349, - "acc_stderr": 0.015080663991563098 - }, - "anli_r2": { - "acc": 0.353, - "acc_stderr": 0.015120172605483687 - }, - "anli_r3": { - "acc": 0.3408333333333333, - "acc_stderr": 0.013688600793296936 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.26129032258064516 - }, - "copa": { - 
"acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4671380203146783, - "acc_stderr": 0.004978992721242829, - "acc_norm": 0.6201951802429795, - "acc_norm_stderr": 0.004843462545943485 - }, - "rte": { - "acc": 0.5270758122743683, - "acc_stderr": 0.0300523034631437 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.013895257666646378 - }, - "storycloze_2016": { - "acc": 0.706574024585783, - "acc_stderr": 0.01052948933474447 - }, - "boolq": { - "acc": 0.6085626911314985, - "acc_stderr": 0.00853643052440395 - }, - "arc_easy": { - "acc": 0.5816498316498316, - "acc_stderr": 0.01012206147074285, - "acc_norm": 0.5526094276094277, - "acc_norm_stderr": 0.01020283238541565 - }, - "arc_challenge": { - "acc": 0.27047781569965873, - "acc_stderr": 0.012980954547659554, - "acc_norm": 0.30631399317406144, - "acc_norm_stderr": 0.013470584417276511 - }, - "sciq": { - "acc": 0.851, - "acc_stderr": 0.011266140684632178, - "acc_norm": 0.802, - "acc_norm_stderr": 0.012607733934175313 - }, - "piqa": { - "acc": 0.7529923830250272, - "acc_stderr": 0.010062268140772629, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525494 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_5.csv b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..f2d5d69551c1e0f6514764d7b387f53a318d34a9 --- /dev/null +++ b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.351,0.015100563798316405,0 +anli_r2,acc,0.351,0.015100563798316402,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.28924914675767915,0.013250012579393443,0 +arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0 +arc_easy,acc,0.5829124579124579,0.010117738967781988,0 +arc_easy,acc_norm,0.5441919191919192,0.010219631763437851,0 +boolq,acc,0.6073394495412844,0.008541161248702914,1 +cb,acc,0.44642857142857145,0.06703189227942397,1 +cb,f1,0.30392518764611787,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.46853216490738897,0.004979889597551664,0 +hellaswag,acc_norm,0.6222863971320454,0.00483824641078626,0 +piqa,acc,0.7475516866158868,0.010135665547362364,0 +piqa,acc_norm,0.7568008705114254,0.010009611953858917,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.848,0.011358918303475275,0 +sciq,acc_norm,0.8,0.01265543994336665,0 +storycloze_2016,acc,0.7055050774986639,0.010540668963800296,0 +winogrande,acc,0.5840568271507498,0.013852485356798259,0 diff --git a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_5_lm-eval_global_step80108_2023-02-24-21-34-52_5shots_backup.json b/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_5_lm-eval_global_step80108_2023-02-24-21-34-52_5shots_backup.json deleted file mode 100644 index 307d41a0645efdb0b14e9778f13673d37cac876e..0000000000000000000000000000000000000000 --- a/4b284b12bc4seed4/evaluation/rankeval/4b284b12bc4seed4_5_lm-eval_global_step80108_2023-02-24-21-34-52_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.351, - "acc_stderr": 0.015100563798316405 - }, - "anli_r2": { - "acc": 0.351, - "acc_stderr": 
0.015100563798316402 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942397, - "f1": 0.30392518764611787 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.46853216490738897, - "acc_stderr": 0.004979889597551664, - "acc_norm": 0.6222863971320454, - "acc_norm_stderr": 0.00483824641078626 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5840568271507498, - "acc_stderr": 0.013852485356798259 - }, - "storycloze_2016": { - "acc": 0.7055050774986639, - "acc_stderr": 0.010540668963800296 - }, - "boolq": { - "acc": 0.6073394495412844, - "acc_stderr": 0.008541161248702914 - }, - "arc_easy": { - "acc": 0.5829124579124579, - "acc_stderr": 0.010117738967781988, - "acc_norm": 0.5441919191919192, - "acc_norm_stderr": 0.010219631763437851 - }, - "arc_challenge": { - "acc": 0.28924914675767915, - "acc_stderr": 0.013250012579393443, - "acc_norm": 0.3037542662116041, - "acc_norm_stderr": 0.01343890918477876 - }, - "sciq": { - "acc": 0.848, - "acc_stderr": 0.011358918303475275, - "acc_norm": 0.8, - "acc_norm_stderr": 0.01265543994336665 - }, - "piqa": { - "acc": 0.7475516866158868, - "acc_stderr": 0.010135665547362364, - "acc_norm": 0.7568008705114254, - "acc_norm_stderr": 0.010009611953858917 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/generation/merged.csv b/4b284b17bc4seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..6a5fd7af2157c54cb9967000afea58111a709f4d --- /dev/null +++ b/4b284b17bc4seed1/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.008169134941790867 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.008169134941790867 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1723284719323317 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1723284719323317 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1877463274388085 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1877463274388085 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.1965472622348377 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.1965472622348377 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.19496951545711436 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.19496951545711436 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19029346353882015 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19029346353882015 +e2e_nlg_cleaned,5,average,multiple,0.1583423625906172 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.049663336738937365 +gem_xsum,0,median,rouge2_fmeasure,0.049663336738937365 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03655839749230803 +gem_xsum,1,median,rouge2_fmeasure,0.03655839749230803 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.037993467492829024 +gem_xsum,2,median,rouge2_fmeasure,0.037993467492829024 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.0418696855155091 +gem_xsum,3,median,rouge2_fmeasure,0.0418696855155091 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010389278018093686 
+gem_xsum,4,median,rouge2_fmeasure,0.010389278018093686 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00043206158301881207 +gem_xsum,5,median,rouge2_fmeasure,0.00043206158301881207 +gem_xsum,5,average,multiple,0.029484371140116 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.052467944710660325 +web_nlg_en,0,median,rouge2_fmeasure,0.052467944710660325 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05269993626863907 +web_nlg_en,1,median,rouge2_fmeasure,0.05269993626863907 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05310933897261573 +web_nlg_en,2,median,rouge2_fmeasure,0.05310933897261573 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.0545990877071863 +web_nlg_en,3,median,rouge2_fmeasure,0.0545990877071863 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05673629789733436 +web_nlg_en,4,median,rouge2_fmeasure,0.05673629789733436 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05716985509173004 +web_nlg_en,5,median,rouge2_fmeasure,0.05716985509173004 +web_nlg_en,5,average,multiple,0.05446374344136097 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03144881649306534 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03144881649306534 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04792367232693977 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04792367232693977 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05111590452574008 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05111590452574008 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04046018099620164 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04046018099620164 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012506081428575835 +wiki_lingua_en,4,median,rouge2_fmeasure,0.012506081428575835 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.001834359089625422 +wiki_lingua_en,5,median,rouge2_fmeasure,0.001834359089625422 +wiki_lingua_en,5,average,multiple,0.030881502476691348 diff --git a/4b284b17bc4seed1/evaluation/generation/merged.json b/4b284b17bc4seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..adbd59b514d0f806c47e645900b13f68bff2f5df --- /dev/null +++ b/4b284b17bc4seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.35460578896365613, "bleu_stderr": 0.04127239985505294, "rouge1_fmeasure": 0.11046047475367161, "rouge1_fmeasure_stderr": 0.0022175469970283947, "rouge1_precision": 0.07598634034268477, "rouge1_precision_stderr": 0.0020593257538953696, "rouge1_recall": 0.30331494602005443, "rouge1_recall_stderr": 0.005045821872290179, "rouge2_fmeasure": 0.052467944710660325, "rouge2_fmeasure_stderr": 0.0013579106932935785, "rouge2_precision": 0.03614080750642198, "rouge2_precision_stderr": 0.0013003604067484368, "rouge2_recall": 0.14679475394818947, "rouge2_recall_stderr": 0.0033169349191795395, "rougeL_fmeasure": 0.10587025011107204, "rougeL_fmeasure_stderr": 0.002057999260024394, "rougeL_precision": 0.07261501210208798, "rougeL_precision_stderr": 0.001929042340123457, "rougeL_recall": 0.29378141049090273, "rougeL_recall_stderr": 0.004903476880957363, "rougeLsum_fmeasure": 0.10544926366442316, "rougeLsum_fmeasure_stderr": 0.0020870180908242673, "rougeLsum_precision": 0.07253602455249049, "rougeLsum_precision_stderr": 0.001960212317107411, "rougeLsum_recall": 0.28972075705368405, "rougeLsum_recall_stderr": 0.004765941005197029}}, "1": {"PALM_prompt": {"bleu": 0.43000649556861953, "bleu_stderr": 0.03769296256723733, "rouge1_fmeasure": 0.11423577207775937, "rouge1_fmeasure_stderr": 0.0019694511711788217, "rouge1_precision": 0.07449267528933835, "rouge1_precision_stderr": 
0.0015591243035150693, "rouge1_recall": 0.3570734232135794, "rouge1_recall_stderr": 0.005049147159370153, "rouge2_fmeasure": 0.05269993626863907, "rouge2_fmeasure_stderr": 0.0012130143492016113, "rouge2_precision": 0.03447632020892676, "rouge2_precision_stderr": 0.0009515126663085263, "rouge2_recall": 0.16931911049070325, "rouge2_recall_stderr": 0.0034730603938979677, "rougeL_fmeasure": 0.10772696325239679, "rougeL_fmeasure_stderr": 0.0018050301374967716, "rougeL_precision": 0.07024660297627622, "rougeL_precision_stderr": 0.0014320687118875425, "rougeL_recall": 0.33400063927103313, "rougeL_recall_stderr": 0.004557935006444777, "rougeLsum_fmeasure": 0.10853509777552768, "rougeLsum_fmeasure_stderr": 0.001862736778219524, "rougeLsum_precision": 0.07087429190816584, "rougeLsum_precision_stderr": 0.0014847845278820806, "rougeLsum_recall": 0.33807210476530836, "rougeLsum_recall_stderr": 0.004700621881301265}}, "2": {"PALM_prompt": {"bleu": 0.46635237695480414, "bleu_stderr": 0.036668809623062, "rouge1_fmeasure": 0.11557506829814276, "rouge1_fmeasure_stderr": 0.001924033280192195, "rouge1_precision": 0.07418652948466994, "rouge1_precision_stderr": 0.0014193439744244487, "rouge1_recall": 0.36556492135686, "rouge1_recall_stderr": 0.0050312007549606534, "rouge2_fmeasure": 0.05310933897261573, "rouge2_fmeasure_stderr": 0.0011935945329990892, "rouge2_precision": 0.033828374294314886, "rouge2_precision_stderr": 0.0008631483412359538, "rouge2_recall": 0.1755206924620156, "rouge2_recall_stderr": 0.0035430702160321806, "rougeL_fmeasure": 0.10811633325240383, "rougeL_fmeasure_stderr": 0.0017651025504977517, "rougeL_precision": 0.06937350612818097, "rougeL_precision_stderr": 0.001298396116933123, "rougeL_recall": 0.3401896184445853, "rougeL_recall_stderr": 0.004572939700685839, "rougeLsum_fmeasure": 0.10996153625412339, "rougeLsum_fmeasure_stderr": 0.001828648876946442, "rougeLsum_precision": 0.07060788755354988, "rougeLsum_precision_stderr": 0.0013492078636814708, "rougeLsum_recall": 0.3466944175988772, "rougeLsum_recall_stderr": 0.004718601773324721}}, "3": {"PALM_prompt": {"bleu": 0.5318880573063225, "bleu_stderr": 0.03077006985866061, "rouge1_fmeasure": 0.1187681526614185, "rouge1_fmeasure_stderr": 0.0018389245831829989, "rouge1_precision": 0.07572199540644148, "rouge1_precision_stderr": 0.0013725787893078976, "rouge1_recall": 0.386186194255226, "rouge1_recall_stderr": 0.0050631478485505975, "rouge2_fmeasure": 0.0545990877071863, "rouge2_fmeasure_stderr": 0.0011418202566802825, "rouge2_precision": 0.034691337373678494, "rouge2_precision_stderr": 0.0008101010333279721, "rouge2_recall": 0.18672789069562837, "rouge2_recall_stderr": 0.0036362158555646932, "rougeL_fmeasure": 0.11094586486203455, "rougeL_fmeasure_stderr": 0.0016863362576384658, "rougeL_precision": 0.07075495815573636, "rougeL_precision_stderr": 0.0012595872580060065, "rougeL_recall": 0.35859989642125883, "rougeL_recall_stderr": 0.00457458234745712, "rougeLsum_fmeasure": 0.11279487606503577, "rougeLsum_fmeasure_stderr": 0.001740345273722019, "rougeLsum_precision": 0.07195645133988224, "rougeLsum_precision_stderr": 0.0013011441014847468, "rougeLsum_recall": 0.3655473787152986, "rougeLsum_recall_stderr": 0.004697139221840445}}, "4": {"PALM_prompt": {"bleu": 0.5501744822594039, "bleu_stderr": 0.03409130654156849, "rouge1_fmeasure": 0.12205117106283064, "rouge1_fmeasure_stderr": 0.0018267167297263492, "rouge1_precision": 0.0773953741957407, "rouge1_precision_stderr": 0.0013204324907578587, "rouge1_recall": 0.39710159332805806, 
"rouge1_recall_stderr": 0.00504803274078318, "rouge2_fmeasure": 0.05673629789733436, "rouge2_fmeasure_stderr": 0.001131466888576198, "rouge2_precision": 0.03574892990415424, "rouge2_precision_stderr": 0.0007844605741780433, "rouge2_recall": 0.1966788736461859, "rouge2_recall_stderr": 0.003719036109492947, "rougeL_fmeasure": 0.11359102901387463, "rougeL_fmeasure_stderr": 0.0016628480882232698, "rougeL_precision": 0.07208379137252659, "rougeL_precision_stderr": 0.0012045521412790883, "rougeL_recall": 0.3679102688951241, "rougeL_recall_stderr": 0.004541973385497078, "rougeLsum_fmeasure": 0.1156616432799891, "rougeLsum_fmeasure_stderr": 0.0017252677961420559, "rougeLsum_precision": 0.07340824717593364, "rougeLsum_precision_stderr": 0.0012507702589347408, "rougeLsum_recall": 0.3754732283837251, "rougeLsum_recall_stderr": 0.004681001943001544}}, "5": {"PALM_prompt": {"bleu": 0.6069653948528808, "bleu_stderr": 0.022478797894781258, "rouge1_fmeasure": 0.12361532318566361, "rouge1_fmeasure_stderr": 0.0018017937208595991, "rouge1_precision": 0.07833603783865227, "rouge1_precision_stderr": 0.001352405348891179, "rouge1_recall": 0.41502730709431096, "rouge1_recall_stderr": 0.005094647215628354, "rouge2_fmeasure": 0.05716985509173004, "rouge2_fmeasure_stderr": 0.0011074288762205806, "rouge2_precision": 0.03600950877265579, "rouge2_precision_stderr": 0.0007884596657334906, "rouge2_recall": 0.20573962541606136, "rouge2_recall_stderr": 0.0037497370778539845, "rougeL_fmeasure": 0.11344042224284119, "rougeL_fmeasure_stderr": 0.0016155314972144464, "rougeL_precision": 0.07196876600610892, "rougeL_precision_stderr": 0.001224924971314539, "rougeL_recall": 0.37833069025228927, "rougeL_recall_stderr": 0.004495187818982786, "rougeLsum_fmeasure": 0.1168897795571233, "rougeLsum_fmeasure_stderr": 0.0017011725978752143, "rougeLsum_precision": 0.07417079754442318, "rougeLsum_precision_stderr": 0.0012856883253605687, "rougeLsum_recall": 0.39062928440269434, "rougeLsum_recall_stderr": 0.004704501383036899}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.390853464486237, "bleu_stderr": 0.059853799172008365, "rouge1_fmeasure": 0.16517157703306565, "rouge1_fmeasure_stderr": 0.0018149558416127082, "rouge1_precision": 0.1430757560611486, "rouge1_precision_stderr": 0.0018512962398626901, "rouge1_recall": 0.23637136153102328, "rouge1_recall_stderr": 0.0025951414839324363, "rouge2_fmeasure": 0.03144881649306534, "rouge2_fmeasure_stderr": 0.0007949045821369007, "rouge2_precision": 0.027253137929797694, "rouge2_precision_stderr": 0.0007565671384632661, "rouge2_recall": 0.046319655637173676, "rouge2_recall_stderr": 0.0013027510329844502, "rougeL_fmeasure": 0.12939552425065465, "rougeL_fmeasure_stderr": 0.0013070901034852402, "rougeL_precision": 0.11076533487865345, "rougeL_precision_stderr": 0.0013126275934646958, "rougeL_recall": 0.18971611018232362, "rougeL_recall_stderr": 0.0020959074049994044, "rougeLsum_fmeasure": 0.151441460006642, "rougeLsum_fmeasure_stderr": 0.0016465261311395824, "rougeLsum_precision": 0.13092618224064537, "rougeLsum_precision_stderr": 0.0016810775282002177, "rougeLsum_recall": 0.21779579893664588, "rougeLsum_recall_stderr": 0.0023948902348789936}}, "1": {"tldr_en": {"bleu": 2.3020814552441036, "bleu_stderr": 0.07124494518745983, "rouge1_fmeasure": 0.20774908941497858, "rouge1_fmeasure_stderr": 0.0019527184699241, "rouge1_precision": 0.17852734604158793, "rouge1_precision_stderr": 0.0020710995740373522, "rouge1_recall": 0.30264596885910944, "rouge1_recall_stderr": 0.00291873292100196, 
"rouge2_fmeasure": 0.04792367232693977, "rouge2_fmeasure_stderr": 0.000971523171352038, "rouge2_precision": 0.04099710309933377, "rouge2_precision_stderr": 0.000910818770381285, "rouge2_recall": 0.07279917781718397, "rouge2_recall_stderr": 0.0016547259631172991, "rougeL_fmeasure": 0.14759799820832575, "rougeL_fmeasure_stderr": 0.0012896913962158606, "rougeL_precision": 0.12547942347426674, "rougeL_precision_stderr": 0.0013525118104691794, "rougeL_recall": 0.22057999445178014, "rougeL_recall_stderr": 0.0022414540337101223, "rougeLsum_fmeasure": 0.19350686245390553, "rougeLsum_fmeasure_stderr": 0.001814216671457794, "rougeLsum_precision": 0.16603857159214477, "rougeLsum_precision_stderr": 0.001918454683011944, "rougeLsum_recall": 0.2826463785548646, "rougeLsum_recall_stderr": 0.0027435570236691792}}, "2": {"tldr_en": {"bleu": 2.56061946074879, "bleu_stderr": 0.08875901983481134, "rouge1_fmeasure": 0.21298028389245655, "rouge1_fmeasure_stderr": 0.0018566335384653636, "rouge1_precision": 0.1832669676031301, "rouge1_precision_stderr": 0.0020160001989551132, "rouge1_recall": 0.31044482930358086, "rouge1_recall_stderr": 0.002782637370104287, "rouge2_fmeasure": 0.05111590452574008, "rouge2_fmeasure_stderr": 0.000984436891043789, "rouge2_precision": 0.04359936792401392, "rouge2_precision_stderr": 0.0009012688991148481, "rouge2_recall": 0.07746108584334276, "rouge2_recall_stderr": 0.0016970078202864917, "rougeL_fmeasure": 0.1515450545072009, "rougeL_fmeasure_stderr": 0.0012442172141465177, "rougeL_precision": 0.1290928592055247, "rougeL_precision_stderr": 0.0013251005584623103, "rougeL_recall": 0.2265125598092535, "rougeL_recall_stderr": 0.0022015069669254045, "rougeLsum_fmeasure": 0.1997192575381439, "rougeLsum_fmeasure_stderr": 0.0017345981527878763, "rougeLsum_precision": 0.17157223010778608, "rougeLsum_precision_stderr": 0.0018727495126747784, "rougeLsum_recall": 0.29190060671500806, "rougeLsum_recall_stderr": 0.0026378499445263008}}, "3": {"tldr_en": {"bleu": 2.4567620772859544, "bleu_stderr": 0.07804801426146958, "rouge1_fmeasure": 0.1734250515140521, "rouge1_fmeasure_stderr": 0.0021585230764058403, "rouge1_precision": 0.15365556799869134, "rouge1_precision_stderr": 0.0022375644528785585, "rouge1_recall": 0.2520250438413879, "rouge1_recall_stderr": 0.003280462869976282, "rouge2_fmeasure": 0.04046018099620164, "rouge2_fmeasure_stderr": 0.0009368491430766626, "rouge2_precision": 0.035304417807292626, "rouge2_precision_stderr": 0.0008933122056836016, "rouge2_recall": 0.06145996914352228, "rouge2_recall_stderr": 0.0015943752876392376, "rougeL_fmeasure": 0.12482706202216799, "rougeL_fmeasure_stderr": 0.0015071076159934608, "rougeL_precision": 0.11014823619866325, "rougeL_precision_stderr": 0.0015873396520271518, "rougeL_recall": 0.1857322798596749, "rougeL_recall_stderr": 0.002553313928784032, "rougeLsum_fmeasure": 0.16261984079091396, "rougeLsum_fmeasure_stderr": 0.00201669945503749, "rougeLsum_precision": 0.14400230894616803, "rougeLsum_precision_stderr": 0.002097114189887866, "rougeLsum_recall": 0.23691973770859226, "rougeLsum_recall_stderr": 0.003108576444854855}}, "4": {"tldr_en": {"bleu": 0.5686192864139299, "bleu_stderr": 0.05329872239897294, "rouge1_fmeasure": 0.05514449811895782, "rouge1_fmeasure_stderr": 0.0018661361529205558, "rouge1_precision": 0.050446112431908365, "rouge1_precision_stderr": 0.0018621346986588624, "rouge1_recall": 0.08296121662582322, "rouge1_recall_stderr": 0.0028815435632905994, "rouge2_fmeasure": 0.012506081428575835, "rouge2_fmeasure_stderr": 0.000617828942630335, 
"rouge2_precision": 0.010827044508083692, "rouge2_precision_stderr": 0.000584691320512827, "rouge2_recall": 0.019907513701632813, "rouge2_recall_stderr": 0.0010930514612001662, "rougeL_fmeasure": 0.04114530708158443, "rougeL_fmeasure_stderr": 0.0013718968214232637, "rougeL_precision": 0.0378964094729927, "rougeL_precision_stderr": 0.001419123037289574, "rougeL_recall": 0.06320549575577322, "rougeL_recall_stderr": 0.0022530025801340424, "rougeLsum_fmeasure": 0.05161513337559881, "rougeLsum_fmeasure_stderr": 0.0017417092171012594, "rougeLsum_precision": 0.047261454636276935, "rougeLsum_precision_stderr": 0.001749098938810562, "rougeLsum_recall": 0.07790933996710452, "rougeLsum_recall_stderr": 0.0027202426274044768}}, "5": {"tldr_en": {"bleu": 1.4978632294934352e-06, "bleu_stderr": 2.8426588022690166e-06, "rouge1_fmeasure": 0.008723812047284148, "rouge1_fmeasure_stderr": 0.0008193557654359252, "rouge1_precision": 0.00816218714291048, "rouge1_precision_stderr": 0.0008225175706365002, "rouge1_recall": 0.013376777535733507, "rouge1_recall_stderr": 0.0012945755094527528, "rouge2_fmeasure": 0.001834359089625422, "rouge2_fmeasure_stderr": 0.00023179541880762196, "rouge2_precision": 0.0016392883778666894, "rouge2_precision_stderr": 0.00022497812116211926, "rouge2_recall": 0.0030504312623576067, "rouge2_recall_stderr": 0.0004174116396029455, "rougeL_fmeasure": 0.00647027468483912, "rougeL_fmeasure_stderr": 0.0005992276725051343, "rougeL_precision": 0.006061858268899674, "rougeL_precision_stderr": 0.000608364835519215, "rougeL_recall": 0.01021577852815356, "rougeL_recall_stderr": 0.0010105057014620934, "rougeLsum_fmeasure": 0.00812179216223168, "rougeLsum_fmeasure_stderr": 0.0007615460927384592, "rougeLsum_precision": 0.007575733113414468, "rougeLsum_precision_stderr": 0.0007607557971305472, "rougeLsum_recall": 0.012573851281871638, "rougeLsum_recall_stderr": 0.001225769033813259}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.5430652885343026, "bleu_stderr": 0.022082484632507872, "rouge1_fmeasure": 0.07741361857983127, "rouge1_fmeasure_stderr": 0.001132924636873198, "rouge1_precision": 0.058690510706937433, "rouge1_precision_stderr": 0.0009708303148239041, "rouge1_recall": 0.12537197958835283, "rouge1_recall_stderr": 0.0018746074102243583, "rouge2_fmeasure": 0.008169134941790867, "rouge2_fmeasure_stderr": 0.00047672897974732135, "rouge2_precision": 0.006081743671322657, "rouge2_precision_stderr": 0.0003657111764040069, "rouge2_recall": 0.013546950620169113, "rouge2_recall_stderr": 0.0007812479279203251, "rougeL_fmeasure": 0.07639604693417318, "rougeL_fmeasure_stderr": 0.0010809118409461501, "rougeL_precision": 0.057686950703333045, "rougeL_precision_stderr": 0.0008896823845173719, "rougeL_recall": 0.12403635392847258, "rougeL_recall_stderr": 0.0018233766742210078, "rougeLsum_fmeasure": 0.06358108342639292, "rougeLsum_fmeasure_stderr": 0.0009407364363846421, "rougeLsum_precision": 0.04821257551275573, "rougeLsum_precision_stderr": 0.000810755418294155, "rougeLsum_recall": 0.10309655754134595, "rougeLsum_recall_stderr": 0.0015536256465239371}}, "1": {"generate_text_restaurant": {"bleu": 9.021355379582062, "bleu_stderr": 0.10623953202209324, "rouge1_fmeasure": 0.4055777436674024, "rouge1_fmeasure_stderr": 0.0020513543480466046, "rouge1_precision": 0.40793804080711205, "rouge1_precision_stderr": 0.0025241462455489836, "rouge1_recall": 0.4447838396302669, "rouge1_recall_stderr": 0.002843720569909992, "rouge2_fmeasure": 0.1723284719323317, "rouge2_fmeasure_stderr": 
0.0017077073940088953, "rouge2_precision": 0.17321636216395442, "rouge2_precision_stderr": 0.0018751370046392074, "rouge2_recall": 0.19070657240665465, "rouge2_recall_stderr": 0.0020935532982887437, "rougeL_fmeasure": 0.2881503807490911, "rougeL_fmeasure_stderr": 0.0016750738518425616, "rougeL_precision": 0.2902723436144399, "rougeL_precision_stderr": 0.0020485949228574835, "rougeL_recall": 0.3170307319809301, "rougeL_recall_stderr": 0.0022979256302620267, "rougeLsum_fmeasure": 0.33767965579292886, "rougeLsum_fmeasure_stderr": 0.0019918613599406863, "rougeLsum_precision": 0.3398038056283089, "rougeLsum_precision_stderr": 0.0023494605618611625, "rougeLsum_recall": 0.37004877183005896, "rougeLsum_recall_stderr": 0.0026282083685719676}}, "2": {"generate_text_restaurant": {"bleu": 10.113940674814147, "bleu_stderr": 0.14158653466380353, "rouge1_fmeasure": 0.4192695997412943, "rouge1_fmeasure_stderr": 0.001956593030198661, "rouge1_precision": 0.42169201018178765, "rouge1_precision_stderr": 0.0024591821051573844, "rouge1_recall": 0.4553757907208419, "rouge1_recall_stderr": 0.002762586151947058, "rouge2_fmeasure": 0.1877463274388085, "rouge2_fmeasure_stderr": 0.0017292126115718328, "rouge2_precision": 0.18860533474575017, "rouge2_precision_stderr": 0.0019093113876152922, "rouge2_recall": 0.20618277051281303, "rouge2_recall_stderr": 0.002147489885830656, "rougeL_fmeasure": 0.3031593987471526, "rougeL_fmeasure_stderr": 0.0017185441047347237, "rougeL_precision": 0.3049199738981857, "rougeL_precision_stderr": 0.002060649727261783, "rougeL_recall": 0.3301213216090321, "rougeL_recall_stderr": 0.0023449418849790346, "rougeLsum_fmeasure": 0.34937362258056176, "rougeLsum_fmeasure_stderr": 0.0019377724085507641, "rougeLsum_precision": 0.3508806465899511, "rougeLsum_precision_stderr": 0.0022749935402855376, "rougeLsum_recall": 0.37984919905452064, "rougeLsum_recall_stderr": 0.002603638405911715}}, "3": {"generate_text_restaurant": {"bleu": 10.8201389431134, "bleu_stderr": 0.14308976771282733, "rouge1_fmeasure": 0.42425148245558203, "rouge1_fmeasure_stderr": 0.001970717028195558, "rouge1_precision": 0.4270119331576845, "rouge1_precision_stderr": 0.0024841694273249355, "rouge1_recall": 0.4595201825615637, "rouge1_recall_stderr": 0.0027434642248067836, "rouge2_fmeasure": 0.1965472622348377, "rouge2_fmeasure_stderr": 0.0017674561951196886, "rouge2_precision": 0.1980135179853101, "rouge2_precision_stderr": 0.001958609184896891, "rouge2_recall": 0.21515925688103615, "rouge2_recall_stderr": 0.0021936934119952973, "rougeL_fmeasure": 0.31077211552436274, "rougeL_fmeasure_stderr": 0.0017645823005779276, "rougeL_precision": 0.313027569485227, "rougeL_precision_stderr": 0.002150997784044619, "rougeL_recall": 0.33729693598184624, "rougeL_recall_stderr": 0.002349898786076012, "rougeLsum_fmeasure": 0.35667974526063834, "rougeLsum_fmeasure_stderr": 0.0019855982113182236, "rougeLsum_precision": 0.35862892839542776, "rougeLsum_precision_stderr": 0.0023574734411340495, "rougeLsum_recall": 0.3868880675862601, "rougeLsum_recall_stderr": 0.002629274635035275}}, "4": {"generate_text_restaurant": {"bleu": 10.748006102366846, "bleu_stderr": 0.20587131675493173, "rouge1_fmeasure": 0.4193816174833963, "rouge1_fmeasure_stderr": 0.0019534021605399406, "rouge1_precision": 0.4203153115235523, "rouge1_precision_stderr": 0.0025761947742893496, "rouge1_recall": 0.45817921600162986, "rouge1_recall_stderr": 0.0026979219990255587, "rouge2_fmeasure": 0.19496951545711436, "rouge2_fmeasure_stderr": 0.0017675695646951184, "rouge2_precision": 
0.19569900813831625, "rouge2_precision_stderr": 0.0019749036772695933, "rouge2_recall": 0.21498311497796316, "rouge2_recall_stderr": 0.002187842079829162, "rougeL_fmeasure": 0.30944368265364386, "rougeL_fmeasure_stderr": 0.001760799097861244, "rougeL_precision": 0.3098963653292, "rougeL_precision_stderr": 0.002187376726727452, "rougeL_recall": 0.3392067881509787, "rougeL_recall_stderr": 0.002363806042112853, "rougeLsum_fmeasure": 0.35483471453855586, "rougeLsum_fmeasure_stderr": 0.0019769772638384736, "rougeLsum_precision": 0.3548498145433127, "rougeLsum_precision_stderr": 0.002404468701475204, "rougeLsum_recall": 0.3885822037053069, "rougeLsum_recall_stderr": 0.0026339915778194056}}, "5": {"generate_text_restaurant": {"bleu": 10.07029280392025, "bleu_stderr": 0.16820417246403815, "rouge1_fmeasure": 0.4120744202773343, "rouge1_fmeasure_stderr": 0.0019268013859534442, "rouge1_precision": 0.3997424443126998, "rouge1_precision_stderr": 0.002509076491945065, "rouge1_recall": 0.46532662291450666, "rouge1_recall_stderr": 0.00264363774082294, "rouge2_fmeasure": 0.19029346353882015, "rouge2_fmeasure_stderr": 0.0017688208079494707, "rouge2_precision": 0.18469396343654157, "rouge2_precision_stderr": 0.0019364037617764882, "rouge2_recall": 0.21700435263229634, "rouge2_recall_stderr": 0.002190664608271055, "rougeL_fmeasure": 0.30540353731105185, "rougeL_fmeasure_stderr": 0.0017767777362628493, "rougeL_precision": 0.2961306858200834, "rougeL_precision_stderr": 0.0021584129425405406, "rougeL_recall": 0.34594887959254145, "rougeL_recall_stderr": 0.002372231185418221, "rougeLsum_fmeasure": 0.35216926515870645, "rougeLsum_fmeasure_stderr": 0.001966241705609662, "rougeLsum_precision": 0.34111556606843185, "rougeLsum_precision_stderr": 0.0023637597564672504, "rougeLsum_recall": 0.39828555999095006, "rougeLsum_recall_stderr": 0.002611728081083023}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1262594378676067, "bleu_stderr": 0.095134222030361, "rouge1_fmeasure": 0.2146540097271338, "rouge1_fmeasure_stderr": 0.0026889142432873487, "rouge1_precision": 0.17140703214052655, "rouge1_precision_stderr": 0.002657385766704294, "rouge1_recall": 0.3328727910689603, "rouge1_recall_stderr": 0.004478215733101042, "rouge2_fmeasure": 0.049663336738937365, "rouge2_fmeasure_stderr": 0.0017642714115669107, "rouge2_precision": 0.03900330994819724, "rouge2_precision_stderr": 0.001528495657116988, "rouge2_recall": 0.08030726522172472, "rouge2_recall_stderr": 0.002887852065979978, "rougeL_fmeasure": 0.16038878419096453, "rougeL_fmeasure_stderr": 0.0021363642948066205, "rougeL_precision": 0.12794926942490142, "rougeL_precision_stderr": 0.0021041803528192753, "rougeL_recall": 0.24989761131083427, "rougeL_recall_stderr": 0.00358408703157254, "rougeLsum_fmeasure": 0.16814747869602256, "rougeLsum_fmeasure_stderr": 0.0022974302463237696, "rougeLsum_precision": 0.1335666168529187, "rougeLsum_precision_stderr": 0.0021728880500778438, "rougeLsum_recall": 0.2632275074866455, "rougeLsum_recall_stderr": 0.003953396990275494}}, "1": {"article_DOC_summary": {"bleu": 1.4483377477041561, "bleu_stderr": 0.0840030327291259, "rouge1_fmeasure": 0.1790918098017675, "rouge1_fmeasure_stderr": 0.0025132906259886117, "rouge1_precision": 0.12762495343099467, "rouge1_precision_stderr": 0.0018780124957160557, "rouge1_recall": 0.313413092533033, "rouge1_recall_stderr": 0.004306518419134443, "rouge2_fmeasure": 0.03655839749230803, "rouge2_fmeasure_stderr": 0.0014038700307555834, "rouge2_precision": 0.025830285615912763, "rouge2_precision_stderr": 
0.0009915971356563732, "rouge2_recall": 0.06570288742608651, "rouge2_recall_stderr": 0.00264662461319788, "rougeL_fmeasure": 0.1388485940184289, "rougeL_fmeasure_stderr": 0.0018751083682789276, "rougeL_precision": 0.09873873361045128, "rougeL_precision_stderr": 0.0013888491496605799, "rougeL_recall": 0.2446195811573636, "rougeL_recall_stderr": 0.0033615248260746096, "rougeLsum_fmeasure": 0.14333789882498887, "rougeLsum_fmeasure_stderr": 0.002102831086991533, "rougeLsum_precision": 0.10194750437982684, "rougeLsum_precision_stderr": 0.0015522863811899836, "rougeLsum_recall": 0.252360107505834, "rougeLsum_recall_stderr": 0.0037266545345916377}}, "2": {"article_DOC_summary": {"bleu": 1.4098334620416249, "bleu_stderr": 0.10251781578887398, "rouge1_fmeasure": 0.18318008255527174, "rouge1_fmeasure_stderr": 0.0024878335268235297, "rouge1_precision": 0.13039220364032736, "rouge1_precision_stderr": 0.0018489941105405477, "rouge1_recall": 0.3208338991546151, "rouge1_recall_stderr": 0.004297695481110425, "rouge2_fmeasure": 0.037993467492829024, "rouge2_fmeasure_stderr": 0.0013560650853370311, "rouge2_precision": 0.026781308730490817, "rouge2_precision_stderr": 0.0009577167465939763, "rouge2_recall": 0.06824095591764738, "rouge2_recall_stderr": 0.0025179645513505435, "rougeL_fmeasure": 0.1433277343773715, "rougeL_fmeasure_stderr": 0.0018719617109376372, "rougeL_precision": 0.10184190144496406, "rougeL_precision_stderr": 0.001381528660279364, "rougeL_recall": 0.25250694238777793, "rougeL_recall_stderr": 0.003364599776037821, "rougeLsum_fmeasure": 0.1459671345211713, "rougeLsum_fmeasure_stderr": 0.002081513626929764, "rougeLsum_precision": 0.10364570773812086, "rougeLsum_precision_stderr": 0.0015257796013009766, "rougeLsum_recall": 0.25744273875109214, "rougeLsum_recall_stderr": 0.0037263830864618396}}, "3": {"article_DOC_summary": {"bleu": 1.7655055241804707, "bleu_stderr": 0.06477912123560818, "rouge1_fmeasure": 0.1823286909702188, "rouge1_fmeasure_stderr": 0.0027172739032140136, "rouge1_precision": 0.13260082146223712, "rouge1_precision_stderr": 0.0020837715361515315, "rouge1_recall": 0.31417364649267365, "rouge1_recall_stderr": 0.004766326484185003, "rouge2_fmeasure": 0.0418696855155091, "rouge2_fmeasure_stderr": 0.001561764077187165, "rouge2_precision": 0.029860167938620372, "rouge2_precision_stderr": 0.001119618909474985, "rouge2_recall": 0.0744234434951538, "rouge2_recall_stderr": 0.002838973765382053, "rougeL_fmeasure": 0.14378532654364076, "rougeL_fmeasure_stderr": 0.0020641586964029695, "rougeL_precision": 0.1044912013394587, "rougeL_precision_stderr": 0.0015885738089965284, "rougeL_recall": 0.24881332740973316, "rougeL_recall_stderr": 0.00370494003948016, "rougeLsum_fmeasure": 0.145758065422687, "rougeLsum_fmeasure_stderr": 0.0022897023386280607, "rougeLsum_precision": 0.10592669923558432, "rougeLsum_precision_stderr": 0.0017438522893054556, "rougeLsum_recall": 0.2524237548781368, "rougeLsum_recall_stderr": 0.004105527632225445}}, "4": {"article_DOC_summary": {"bleu": 0.7281725414747328, "bleu_stderr": 0.1260106543272375, "rouge1_fmeasure": 0.04910213359418778, "rouge1_fmeasure_stderr": 0.002744537374746729, "rouge1_precision": 0.041688643979342556, "rouge1_precision_stderr": 0.002601042149795104, "rouge1_recall": 0.07735095250848614, "rouge1_recall_stderr": 0.004456764254802236, "rouge2_fmeasure": 0.010389278018093686, "rouge2_fmeasure_stderr": 0.000885603435222816, "rouge2_precision": 0.007962003061308665, "rouge2_precision_stderr": 0.000712056980108108, "rouge2_recall": 0.01755275162918786, 
"rouge2_recall_stderr": 0.0015467145554586869, "rougeL_fmeasure": 0.03783956544850372, "rougeL_fmeasure_stderr": 0.0021016204447309272, "rougeL_precision": 0.03254096549229946, "rougeL_precision_stderr": 0.002113524713833357, "rougeL_recall": 0.05986377270001221, "rougeL_recall_stderr": 0.0034321169053043903, "rougeLsum_fmeasure": 0.039609121778678015, "rougeLsum_fmeasure_stderr": 0.0022308134808697523, "rougeLsum_precision": 0.0339927160449478, "rougeLsum_precision_stderr": 0.0022040533285906335, "rougeLsum_recall": 0.06260373695298717, "rougeLsum_recall_stderr": 0.003653558913424135}}, "5": {"article_DOC_summary": {"bleu": 1.2076944953786812e-38, "bleu_stderr": 9.41695631660932e-34, "rouge1_fmeasure": 0.002427323172932295, "rouge1_fmeasure_stderr": 0.0006885605705564534, "rouge1_precision": 0.0027081548968779534, "rouge1_precision_stderr": 0.000769629501128022, "rouge1_recall": 0.0022854665614044562, "rouge1_recall_stderr": 0.0006527724974423966, "rouge2_fmeasure": 0.00043206158301881207, "rouge2_fmeasure_stderr": 0.00019988234207854035, "rouge2_precision": 0.00048226606911401585, "rouge2_precision_stderr": 0.00021993176645429774, "rouge2_recall": 0.0003963716930303764, "rouge2_recall_stderr": 0.00018610435903420472, "rougeL_fmeasure": 0.001903925076413495, "rougeL_fmeasure_stderr": 0.0005355714845904816, "rougeL_precision": 0.002098042355577843, "rougeL_precision_stderr": 0.0005877770180975632, "rougeL_recall": 0.001821815264382321, "rougeL_recall_stderr": 0.000523325114699533, "rougeLsum_fmeasure": 0.001994261659228993, "rougeLsum_fmeasure_stderr": 0.0005516448655133461, "rougeLsum_precision": 0.0022069481248609703, "rougeLsum_precision_stderr": 0.0006101246103421121, "rougeLsum_recall": 0.0018999298389345458, "rougeLsum_recall_stderr": 0.0005353725566945594}}}} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_0.csv b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..c8e96d2dce87db3148a05a9165ac9e2df0005787 --- /dev/null +++ b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928366,0 +anli_r2,acc,0.329,0.014865395385928364,0 +anli_r3,acc,0.3458333333333333,0.013736245342311012,0 +arc_challenge,acc,0.2627986348122867,0.012862523175351333,0 +arc_challenge,acc_norm,0.28242320819112626,0.013155456884097222,0 +arc_easy,acc,0.5652356902356902,0.010172083670402784,0 +arc_easy,acc_norm,0.5130471380471381,0.01025628992505844,0 +boolq,acc,0.6241590214067279,0.008471147248160112,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.1940928270042194,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.45429197371041624,0.00496888813029007,0 +hellaswag,acc_norm,0.5936068512248556,0.004901558132335521,0 +piqa,acc,0.7372143634385201,0.010269354068140767,0 +piqa,acc_norm,0.7459194776931447,0.010157271999135051,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.826,0.011994493230973428,0 +sciq,acc_norm,0.726,0.014111099288259588,0 +storycloze_2016,acc,0.7108498129342598,0.010484068799942079,0 +winogrande,acc,0.5619573796369376,0.013944181296470804,0 diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json deleted file mode 100644 index 
35d3f81fa415a0773095ed1eac7798b807cfa514..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928366 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928364 - }, - "anli_r3": { - "acc": 0.3458333333333333, - "acc_stderr": 0.013736245342311012 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.1940928270042194 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.45429197371041624, - "acc_stderr": 0.00496888813029007, - "acc_norm": 0.5936068512248556, - "acc_norm_stderr": 0.004901558132335521 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366426 - }, - "winogrande": { - "acc": 0.5619573796369376, - "acc_stderr": 0.013944181296470804 - }, - "storycloze_2016": { - "acc": 0.7108498129342598, - "acc_stderr": 0.010484068799942079 - }, - "boolq": { - "acc": 0.6241590214067279, - "acc_stderr": 0.008471147248160112 - }, - "arc_easy": { - "acc": 0.5652356902356902, - "acc_stderr": 0.010172083670402784, - "acc_norm": 0.5130471380471381, - "acc_norm_stderr": 0.01025628992505844 - }, - "arc_challenge": { - "acc": 0.2627986348122867, - "acc_stderr": 0.012862523175351333, - "acc_norm": 0.28242320819112626, - "acc_norm_stderr": 0.013155456884097222 - }, - "sciq": { - "acc": 0.826, - "acc_stderr": 0.011994493230973428, - "acc_norm": 0.726, - "acc_norm_stderr": 0.014111099288259588 - }, - "piqa": { - "acc": 0.7372143634385201, - "acc_stderr": 0.010269354068140767, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.010157271999135051 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_1.csv b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..85471339326b190b03f2bf1a1b7185f8d04f410c --- /dev/null +++ b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.341,0.0149981313484027,0 +anli_r2,acc,0.328,0.014853842487270336,0 +anli_r3,acc,0.3425,0.013704669762934725,0 +arc_challenge,acc,0.2713310580204778,0.0129938077275458,0 +arc_challenge,acc_norm,0.29436860068259385,0.013318528460539424,0 +arc_easy,acc,0.5917508417508418,0.010085566195791248,0 +arc_easy,acc_norm,0.5517676767676768,0.010204645126856942,0 +boolq,acc,0.6192660550458715,0.008492625561656213,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.24603174603174607,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.45140410276837284,0.004966158142645419,0 +hellaswag,acc_norm,0.5957976498705437,0.004897340793314379,0 +piqa,acc,0.7410228509249184,0.01022096603140561,0 +piqa,acc_norm,0.749727965179543,0.01010656188008977,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.877,0.010391293421849874,0 +sciq,acc_norm,0.83,0.01188449583454167,0 +storycloze_2016,acc,0.6958845537145911,0.010638172655194796,0 +winogrande,acc,0.5706393054459353,0.013911537499969165,0 diff --git 
a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json deleted file mode 100644 index d395cbdf7e342daa7c2ca03a3528a30c5ebc0135..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.341, - "acc_stderr": 0.0149981313484027 - }, - "anli_r2": { - "acc": 0.328, - "acc_stderr": 0.014853842487270336 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934725 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.24603174603174607 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.45140410276837284, - "acc_stderr": 0.004966158142645419, - "acc_norm": 0.5957976498705437, - "acc_norm_stderr": 0.004897340793314379 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.013911537499969165 - }, - "storycloze_2016": { - "acc": 0.6958845537145911, - "acc_stderr": 0.010638172655194796 - }, - "boolq": { - "acc": 0.6192660550458715, - "acc_stderr": 0.008492625561656213 - }, - "arc_easy": { - "acc": 0.5917508417508418, - "acc_stderr": 0.010085566195791248, - "acc_norm": 0.5517676767676768, - "acc_norm_stderr": 0.010204645126856942 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.0129938077275458, - "acc_norm": 0.29436860068259385, - "acc_norm_stderr": 0.013318528460539424 - }, - "sciq": { - "acc": 0.877, - "acc_stderr": 0.010391293421849874, - "acc_norm": 0.83, - "acc_norm_stderr": 0.01188449583454167 - }, - "piqa": { - "acc": 0.7410228509249184, - "acc_stderr": 0.01022096603140561, - "acc_norm": 0.749727965179543, - "acc_norm_stderr": 0.01010656188008977 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_2.csv b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..4404b0e64db9f841df45b8d8d253d84a71a951eb --- /dev/null +++ b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.321,0.014770821817934635,0 +anli_r2,acc,0.334,0.01492201952373296,0 +anli_r3,acc,0.32,0.013471620929769135,0 +arc_challenge,acc,0.27303754266211605,0.01301933276263575,0 +arc_challenge,acc_norm,0.30119453924914674,0.013406741767847626,0 +arc_easy,acc,0.6043771043771043,0.010033741393430986,0 +arc_easy,acc_norm,0.5778619528619529,0.01013462052459227,0 +boolq,acc,0.617737003058104,0.008499149690449272,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.2988943957300801,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.45130452101175067,0.004966060995315068,0 +hellaswag,acc_norm,0.5956980681139216,0.004897534686686327,0 +piqa,acc,0.7377584330794341,0.01026250256517245,0 +piqa,acc_norm,0.7442872687704026,0.01017869010945987,0 +rte,acc,0.516245487364621,0.030080573208738064,0 +sciq,acc,0.896,0.009658016218524305,0 
+sciq,acc_norm,0.863,0.010878848714333316,0 +storycloze_2016,acc,0.6980224478888295,0.010616985436073357,0 +winogrande,acc,0.5714285714285714,0.013908353814606686,0 diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json deleted file mode 100644 index 40d51e1255cc9de4f561f1e0be9e7dd847ab5074..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.321, - "acc_stderr": 0.014770821817934635 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.01492201952373296 - }, - "anli_r3": { - "acc": 0.32, - "acc_stderr": 0.013471620929769135 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.2988943957300801 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.45130452101175067, - "acc_stderr": 0.004966060995315068, - "acc_norm": 0.5956980681139216, - "acc_norm_stderr": 0.004897534686686327 - }, - "rte": { - "acc": 0.516245487364621, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5714285714285714, - "acc_stderr": 0.013908353814606686 - }, - "storycloze_2016": { - "acc": 0.6980224478888295, - "acc_stderr": 0.010616985436073357 - }, - "boolq": { - "acc": 0.617737003058104, - "acc_stderr": 0.008499149690449272 - }, - "arc_easy": { - "acc": 0.6043771043771043, - "acc_stderr": 0.010033741393430986, - "acc_norm": 0.5778619528619529, - "acc_norm_stderr": 0.01013462052459227 - }, - "arc_challenge": { - "acc": 0.27303754266211605, - "acc_stderr": 0.01301933276263575, - "acc_norm": 0.30119453924914674, - "acc_norm_stderr": 0.013406741767847626 - }, - "sciq": { - "acc": 0.896, - "acc_stderr": 0.009658016218524305, - "acc_norm": 0.863, - "acc_norm_stderr": 0.010878848714333316 - }, - "piqa": { - "acc": 0.7377584330794341, - "acc_stderr": 0.01026250256517245, - "acc_norm": 0.7442872687704026, - "acc_norm_stderr": 0.01017869010945987 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_3.csv b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..21ec84bdd54121b05b28efbb33ffb9c0e91483e9 --- /dev/null +++ b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.015050266127564438,0 +anli_r2,acc,0.364,0.015222868840522024,0 +anli_r3,acc,0.3325,0.013605417345710526,0 +arc_challenge,acc,0.2832764505119454,0.013167478735134575,0 +arc_challenge,acc_norm,0.3037542662116041,0.01343890918477876,0 +arc_easy,acc,0.5989057239057239,0.010057051106534372,0 +arc_easy,acc_norm,0.5812289562289562,0.010123487160167819,0 +boolq,acc,0.5981651376146789,0.008574857171671134,1 +cb,acc,0.44642857142857145,0.067031892279424,1 +cb,f1,0.31977105885280577,,1 +copa,acc,0.8,0.04020151261036844,0 +hellaswag,acc,0.44851623182632944,0.004963259311700562,0 +hellaswag,acc_norm,0.5903206532563234,0.004907694727935689,0 
+piqa,acc,0.7426550598476604,0.01019992106479251,0 +piqa,acc_norm,0.7535364526659413,0.010054810789671811,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.899,0.009533618929341002,0 +sciq,acc_norm,0.872,0.010570133761108658,0 +storycloze_2016,acc,0.706574024585783,0.010529489334744471,0 +winogrande,acc,0.5643251775848461,0.01393570973961571,0 diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json deleted file mode 100644 index 9922d75a28f3ef6180b9e972ccadca8156decdc9..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.015050266127564438 - }, - "anli_r2": { - "acc": 0.364, - "acc_stderr": 0.015222868840522024 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.013605417345710526 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.067031892279424, - "f1": 0.31977105885280577 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036844 - }, - "hellaswag": { - "acc": 0.44851623182632944, - "acc_stderr": 0.004963259311700562, - "acc_norm": 0.5903206532563234, - "acc_norm_stderr": 0.004907694727935689 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.5643251775848461, - "acc_stderr": 0.01393570973961571 - }, - "storycloze_2016": { - "acc": 0.706574024585783, - "acc_stderr": 0.010529489334744471 - }, - "boolq": { - "acc": 0.5981651376146789, - "acc_stderr": 0.008574857171671134 - }, - "arc_easy": { - "acc": 0.5989057239057239, - "acc_stderr": 0.010057051106534372, - "acc_norm": 0.5812289562289562, - "acc_norm_stderr": 0.010123487160167819 - }, - "arc_challenge": { - "acc": 0.2832764505119454, - "acc_stderr": 0.013167478735134575, - "acc_norm": 0.3037542662116041, - "acc_norm_stderr": 0.01343890918477876 - }, - "sciq": { - "acc": 0.899, - "acc_stderr": 0.009533618929341002, - "acc_norm": 0.872, - "acc_norm_stderr": 0.010570133761108658 - }, - "piqa": { - "acc": 0.7426550598476604, - "acc_stderr": 0.01019992106479251, - "acc_norm": 0.7535364526659413, - "acc_norm_stderr": 0.010054810789671811 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_4.csv b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..9332330d63bae3569f832bc51cc2ce3b4a981e66 --- /dev/null +++ b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.348,0.01507060460376841,0 +anli_r2,acc,0.364,0.01522286884052202,0 +anli_r3,acc,0.33666666666666667,0.013647602942406389,0 +arc_challenge,acc,0.2773037542662116,0.013082095839059374,0 +arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0 +arc_easy,acc,0.6035353535353535,0.010037412763064526,0 +arc_easy,acc_norm,0.5854377104377104,0.010108889212447783,0 +boolq,acc,0.6024464831804281,0.008559523256936824,1 
+cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2536231884057971,,1 +copa,acc,0.81,0.03942772444036622,0 +hellaswag,acc,0.4493128858793069,0.0049640758701203404,0 +hellaswag,acc_norm,0.5959968133837881,0.0048969523785069215,0 +piqa,acc,0.7383025027203483,0.010255630772708232,0 +piqa,acc_norm,0.7470076169749728,0.01014288869886245,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.902,0.009406619184621236,0 +sciq,acc_norm,0.882,0.01020686926438179,0 +storycloze_2016,acc,0.6990913949759487,0.010606289538707344,0 +winogrande,acc,0.5556432517758485,0.013965196769083553,0 diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json deleted file mode 100644 index 506b233d8626446b9d36f6d6790545f4badd3ec3..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.348, - "acc_stderr": 0.01507060460376841 - }, - "anli_r2": { - "acc": 0.364, - "acc_stderr": 0.01522286884052202 - }, - "anli_r3": { - "acc": 0.33666666666666667, - "acc_stderr": 0.013647602942406389 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2536231884057971 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036622 - }, - "hellaswag": { - "acc": 0.4493128858793069, - "acc_stderr": 0.0049640758701203404, - "acc_norm": 0.5959968133837881, - "acc_norm_stderr": 0.0048969523785069215 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5556432517758485, - "acc_stderr": 0.013965196769083553 - }, - "storycloze_2016": { - "acc": 0.6990913949759487, - "acc_stderr": 0.010606289538707344 - }, - "boolq": { - "acc": 0.6024464831804281, - "acc_stderr": 0.008559523256936824 - }, - "arc_easy": { - "acc": 0.6035353535353535, - "acc_stderr": 0.010037412763064526, - "acc_norm": 0.5854377104377104, - "acc_norm_stderr": 0.010108889212447783 - }, - "arc_challenge": { - "acc": 0.2773037542662116, - "acc_stderr": 0.013082095839059374, - "acc_norm": 0.30887372013651876, - "acc_norm_stderr": 0.013501770929344003 - }, - "sciq": { - "acc": 0.902, - "acc_stderr": 0.009406619184621236, - "acc_norm": 0.882, - "acc_norm_stderr": 0.01020686926438179 - }, - "piqa": { - "acc": 0.7383025027203483, - "acc_stderr": 0.010255630772708232, - "acc_norm": 0.7470076169749728, - "acc_norm_stderr": 0.01014288869886245 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_5.csv b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..3d0964d213245449140aed88808c891a2651effc --- /dev/null +++ b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.345,0.015039986742055237,0 +anli_r2,acc,0.33,0.014876872027456724,0 +anli_r3,acc,0.32666666666666666,0.013544340907003665,0 +arc_challenge,acc,0.2935153583617747,0.01330725044494112,0 
+arc_challenge,acc_norm,0.318259385665529,0.013611993916971453,0 +arc_easy,acc,0.6094276094276094,0.010011059112064237,0 +arc_easy,acc_norm,0.5951178451178452,0.0100724239603957,0 +boolq,acc,0.6009174311926605,0.008565077958836783,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.30501089324618735,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4470225054769966,0.004961693567208819,0 +hellaswag,acc_norm,0.5973909579764987,0.004894210011303224,0 +piqa,acc,0.735038084874864,0.010296557993316047,0 +piqa,acc_norm,0.7529923830250272,0.01006226814077264,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.907,0.00918887563499668,0 +sciq,acc_norm,0.886,0.010055103435823332,0 +storycloze_2016,acc,0.6996258685195083,0.010600915927985021,0 +winogrande,acc,0.5572217837411207,0.013960157350784983,0 diff --git a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json b/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json deleted file mode 100644 index 235373655f02ae7275ada64cb2204d7cc76efe29..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed1/evaluation/rankeval/4b284b17bc4seed1_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.014876872027456724 - }, - "anli_r3": { - "acc": 0.32666666666666666, - "acc_stderr": 0.013544340907003665 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.30501089324618735 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4470225054769966, - "acc_stderr": 0.004961693567208819, - "acc_norm": 0.5973909579764987, - "acc_norm_stderr": 0.004894210011303224 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5572217837411207, - "acc_stderr": 0.013960157350784983 - }, - "storycloze_2016": { - "acc": 0.6996258685195083, - "acc_stderr": 0.010600915927985021 - }, - "boolq": { - "acc": 0.6009174311926605, - "acc_stderr": 0.008565077958836783 - }, - "arc_easy": { - "acc": 0.6094276094276094, - "acc_stderr": 0.010011059112064237, - "acc_norm": 0.5951178451178452, - "acc_norm_stderr": 0.0100724239603957 - }, - "arc_challenge": { - "acc": 0.2935153583617747, - "acc_stderr": 0.01330725044494112, - "acc_norm": 0.318259385665529, - "acc_norm_stderr": 0.013611993916971453 - }, - "sciq": { - "acc": 0.907, - "acc_stderr": 0.00918887563499668, - "acc_norm": 0.886, - "acc_norm_stderr": 0.010055103435823332 - }, - "piqa": { - "acc": 0.735038084874864, - "acc_stderr": 0.010296557993316047, - "acc_norm": 0.7529923830250272, - "acc_norm_stderr": 0.01006226814077264 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_1.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_2.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_3.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-web_nlg_en_PALM_prompt_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_1.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_2.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_3.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_4.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_GEM-wiki_lingua_en_tldr_en_5.jsonl new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_0.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_1.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_1.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_2.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_2.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_3.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_3.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_4.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_4.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_5.jsonl b/4b284b17bc4seed2/evaluation/generation/examples.4b284b17bc4seed2_gem_xsum_article_DOC_summary_5.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/4b284b17bc4seed2/evaluation/generation/merged.csv b/4b284b17bc4seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..8d4ad2a8389c889b720cdeadb760a63ce6875229 --- /dev/null +++ b/4b284b17bc4seed2/evaluation/generation/merged.csv @@ -0,0 +1 @@ +dataset,fewshots,prompt,metric,value diff --git a/4b284b17bc4seed2/evaluation/generation/merged.json b/4b284b17bc4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..9e26dfeeb6e641a33dae4961196235bdb965b21b --- /dev/null +++ b/4b284b17bc4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..799ddc276b80dd84c1cf2e75147fc834d149e32e --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.38814184794485884, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05047480258801663}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.07565066920874082, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0015551905322164353}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3238815837744594, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. 
To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004766360419359012}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11509783418657757, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020703242193275064}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.035464553016307084, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0009537389454644533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.15704909118734803, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032984157050335527}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05417783468044965, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012997434401208422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.07280668033107564, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014362151271666136}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.315212302235133, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046596548650644975}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.111131126212517, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001936148683137272}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.07237282830314168, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0014666510212682613}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3093807689813474, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004471237151849855}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11011217201582478, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. 
The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019522649157393596}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b17bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a844c3279363e4677137d5640fddb2aab0c18e50 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.15160180317053568, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001846868104763307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.2579236357228536, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002563846709803964}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.17714731434706937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017820831483570008}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.030188268597376115, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007473222285048291}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.05301218199575937, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013343798734673441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03539075879427683, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": 
"d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000818749326693359}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11690294044579747, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012937532849923467}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.20645902517477255, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021105218188585406}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13824591665193203, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0012760816451838966}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.13937660458718434, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001680447465334119}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.23852271853532409, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023644372828476343}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.16318126157866722, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001624188114564557}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4721568574077277, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.039649110894939166}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b17bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 
3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.json b/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7f98a1288771d399de7e9343ff829dab7a40a748 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/agg.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.1557902081027143, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0022032876149756923}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.3350673880900352, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004415721805932336}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.20470721378981208, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0025226661937158843}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.03445353133744446, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012154256732747522}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0798147637879874, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002874504215431858}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.046452959321020074, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015784279302210721}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.1191156380371497, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001689615531792646}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.2591701921041493, 
"fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0035902067326739374}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.15701792133721412, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001947718861021701}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.12234454178265489, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018119505282670214}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.26657728841908296, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.003920062645044777}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.16148501668654708, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002159848508290731}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.9143202589684163, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05580184327046581}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b17bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl b/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..1447430804d63fbc295f6bd55a4abd33218e9981 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcf512dc85b85c9de6c0840d30233487022206c4e077a5cdf1d6ece3da2a5ef3 +size 4110433 diff --git 
a/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl b/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..344a19101ea75f9f2c0af2c538181c58de447630 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:681b08041e3249ccf077f00b7466fe64995dee4bf8342818389394216774e25f +size 7695751 diff --git a/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.jsonl b/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..0cb759d6c235c17ca1caf95d2952a2aa576df883 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/examples.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bc2f574e7f8b60ecf80dd5cb5ac134d1c0259709dc368f8bed58f295ca731b5 +size 2811242 diff --git a/4b284b17bc4seed3/evaluation/generation/merged.csv b/4b284b17bc4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d6aa552ec019bce1c519ed1a13c28a31d2afecb1 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/merged.csv @@ -0,0 +1,10 @@ +dataset,fewshots,prompt,metric,value +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.046452959321020074 +gem_xsum,0,median,rouge2_fmeasure,0.046452959321020074 +gem_xsum,0,average,multiple,0.046452959321020074 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05417783468044965 +web_nlg_en,0,median,rouge2_fmeasure,0.05417783468044965 +web_nlg_en,0,average,multiple,0.05417783468044965 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03539075879427683 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03539075879427683 +wiki_lingua_en,0,average,multiple,0.03539075879427683 diff --git a/4b284b17bc4seed3/evaluation/generation/merged.json b/4b284b17bc4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..5334b3248aef3fca241b875fab8ae6d6b4a21f64 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.38814184794485884, "bleu_stderr": 0.05047480258801663, "rouge1_fmeasure": 0.11509783418657757, "rouge1_fmeasure_stderr": 0.0020703242193275064, "rouge1_precision": 0.07565066920874082, "rouge1_precision_stderr": 0.0015551905322164353, "rouge1_recall": 0.3238815837744594, "rouge1_recall_stderr": 0.004766360419359012, "rouge2_fmeasure": 0.05417783468044965, "rouge2_fmeasure_stderr": 0.0012997434401208422, "rouge2_precision": 0.035464553016307084, "rouge2_precision_stderr": 0.0009537389454644533, "rouge2_recall": 0.15704909118734803, "rouge2_recall_stderr": 0.0032984157050335527, "rougeL_fmeasure": 0.111131126212517, "rougeL_fmeasure_stderr": 0.001936148683137272, "rougeL_precision": 0.07280668033107564, "rougeL_precision_stderr": 0.0014362151271666136, "rougeL_recall": 0.315212302235133, "rougeL_recall_stderr": 0.0046596548650644975, "rougeLsum_fmeasure": 0.11011217201582478, "rougeLsum_fmeasure_stderr": 0.0019522649157393596, "rougeLsum_precision": 0.07237282830314168, "rougeLsum_precision_stderr": 0.0014666510212682613, "rougeLsum_recall": 0.3093807689813474, "rougeLsum_recall_stderr": 0.004471237151849855}}}, 
"GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4721568574077277, "bleu_stderr": 0.039649110894939166, "rouge1_fmeasure": 0.17714731434706937, "rouge1_fmeasure_stderr": 0.0017820831483570008, "rouge1_precision": 0.15160180317053568, "rouge1_precision_stderr": 0.001846868104763307, "rouge1_recall": 0.2579236357228536, "rouge1_recall_stderr": 0.002563846709803964, "rouge2_fmeasure": 0.03539075879427683, "rouge2_fmeasure_stderr": 0.000818749326693359, "rouge2_precision": 0.030188268597376115, "rouge2_precision_stderr": 0.0007473222285048291, "rouge2_recall": 0.05301218199575937, "rouge2_recall_stderr": 0.0013343798734673441, "rougeL_fmeasure": 0.13824591665193203, "rougeL_fmeasure_stderr": 0.0012760816451838966, "rougeL_precision": 0.11690294044579747, "rougeL_precision_stderr": 0.0012937532849923467, "rougeL_recall": 0.20645902517477255, "rougeL_recall_stderr": 0.0021105218188585406, "rougeLsum_fmeasure": 0.16318126157866722, "rougeLsum_fmeasure_stderr": 0.001624188114564557, "rougeLsum_precision": 0.13937660458718434, "rougeLsum_precision_stderr": 0.001680447465334119, "rougeLsum_recall": 0.23852271853532409, "rougeLsum_recall_stderr": 0.0023644372828476343}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9143202589684163, "bleu_stderr": 0.05580184327046581, "rouge1_fmeasure": 0.20470721378981208, "rouge1_fmeasure_stderr": 0.0025226661937158843, "rouge1_precision": 0.1557902081027143, "rouge1_precision_stderr": 0.0022032876149756923, "rouge1_recall": 0.3350673880900352, "rouge1_recall_stderr": 0.004415721805932336, "rouge2_fmeasure": 0.046452959321020074, "rouge2_fmeasure_stderr": 0.0015784279302210721, "rouge2_precision": 0.03445353133744446, "rouge2_precision_stderr": 0.0012154256732747522, "rouge2_recall": 0.0798147637879874, "rouge2_recall_stderr": 0.002874504215431858, "rougeL_fmeasure": 0.15701792133721412, "rougeL_fmeasure_stderr": 0.001947718861021701, "rougeL_precision": 0.1191156380371497, "rougeL_precision_stderr": 0.001689615531792646, "rougeL_recall": 0.2591701921041493, "rougeL_recall_stderr": 0.0035902067326739374, "rougeLsum_fmeasure": 0.16148501668654708, "rougeLsum_fmeasure_stderr": 0.002159848508290731, "rougeLsum_precision": 0.12234454178265489, "rougeLsum_precision_stderr": 0.0018119505282670214, "rougeLsum_recall": 0.26657728841908296, "rougeLsum_recall_stderr": 0.003920062645044777}}}} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..571955ac33b926784e8322e99d32c74039332096 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.38814184794485884, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05047480258801663 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07565066920874082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015551905322164353 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3238815837744594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004766360419359012 + }, + { + 
"task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11509783418657757, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020703242193275064 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035464553016307084, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009537389454644533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15704909118734803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032984157050335527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05417783468044965, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012997434401208422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07280668033107564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014362151271666136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.315212302235133, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046596548650644975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.111131126212517, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001936148683137272 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07237282830314168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014666510212682613 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3093807689813474, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004471237151849855 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11011217201582478, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019522649157393596 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b17bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cc46e1fc522867a7490426bcb4250b2f28f344f1 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.15160180317053568, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": 
"en", + "subset": null, + "rouge1_precision_stderr": 0.001846868104763307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2579236357228536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002563846709803964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17714731434706937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017820831483570008 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.030188268597376115, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007473222285048291 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05301218199575937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013343798734673441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03539075879427683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000818749326693359 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11690294044579747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012937532849923467 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20645902517477255, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021105218188585406 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13824591665193203, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012760816451838966 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.13937660458718434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001680447465334119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23852271853532409, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023644372828476343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.16318126157866722, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001624188114564557 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.4721568574077277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.039649110894939166 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b17bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git 
a/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.json b/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..1f96726704ee5725f967fd1cf55252d07dcda511 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/generation/slim.4b284b17bc4seed3_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1557902081027143, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0022032876149756923 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3350673880900352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004415721805932336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.20470721378981208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025226661937158843 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03445353133744446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012154256732747522 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0798147637879874, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002874504215431858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.046452959321020074, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015784279302210721 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1191156380371497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001689615531792646 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2591701921041493, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035902067326739374 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15701792133721412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001947718861021701 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.12234454178265489, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018119505282670214 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26657728841908296, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003920062645044777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16148501668654708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002159848508290731 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9143202589684163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05580184327046581 + } + ], + "config": { + 
"model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4seeds/4b284b17bc4seed3/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_0.csv b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..d5886f5ed8f79f1d76fb18009e1bc70b8792c8f4 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.333,0.014910846164229859,0 +anli_r2,acc,0.327,0.014842213153411247,0 +anli_r3,acc,0.3308333333333333,0.013588208070709006,0 +arc_challenge,acc,0.26791808873720135,0.012942030195136435,0 +arc_challenge,acc_norm,0.295221843003413,0.013329750293382316,0 +arc_easy,acc,0.5837542087542088,0.010114819404500867,0 +arc_easy,acc_norm,0.5223063973063973,0.010249568404555655,0 +boolq,acc,0.618348623853211,0.008496550741178261,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.3057817998994469,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.47161919936267677,0.004981736689518751,0 +hellaswag,acc_norm,0.6169089822744473,0.004851466623601434,0 +piqa,acc,0.7557127312295974,0.010024765172284253,0 +piqa,acc_norm,0.764417845484222,0.009901067586473883,0 +rte,acc,0.5848375451263538,0.02966006629089348,0 +sciq,acc,0.846,0.011419913065098715,0 +sciq,acc_norm,0.742,0.013842963108656604,0 +storycloze_2016,acc,0.7135221806520577,0.01045510591863303,0 +winogrande,acc,0.5816890292028414,0.013863669961195904,0 diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_0_lm-eval_global_step80108_2023-02-24-23-57-50_0shots_backup.json b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_0_lm-eval_global_step80108_2023-02-24-23-57-50_0shots_backup.json deleted file mode 100644 index 6404bc9f7e1ad74f8cdeda4d1afdffb2968acc20..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_0_lm-eval_global_step80108_2023-02-24-23-57-50_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.333, - "acc_stderr": 0.014910846164229859 - }, - "anli_r2": { - "acc": 0.327, - "acc_stderr": 0.014842213153411247 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070709006 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.3057817998994469 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.47161919936267677, - "acc_stderr": 0.004981736689518751, - "acc_norm": 0.6169089822744473, - "acc_norm_stderr": 0.004851466623601434 - }, - "rte": { - "acc": 0.5848375451263538, - "acc_stderr": 0.02966006629089348 - }, - "winogrande": { - "acc": 0.5816890292028414, - "acc_stderr": 0.013863669961195904 - }, - "storycloze_2016": { - "acc": 0.7135221806520577, - "acc_stderr": 0.01045510591863303 - }, - "boolq": { - "acc": 0.618348623853211, - "acc_stderr": 0.008496550741178261 - }, - "arc_easy": { - "acc": 0.5837542087542088, - "acc_stderr": 0.010114819404500867, - "acc_norm": 0.5223063973063973, - "acc_norm_stderr": 0.010249568404555655 - }, - "arc_challenge": { - "acc": 
0.26791808873720135, - "acc_stderr": 0.012942030195136435, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.013329750293382316 - }, - "sciq": { - "acc": 0.846, - "acc_stderr": 0.011419913065098715, - "acc_norm": 0.742, - "acc_norm_stderr": 0.013842963108656604 - }, - "piqa": { - "acc": 0.7557127312295974, - "acc_stderr": 0.010024765172284253, - "acc_norm": 0.764417845484222, - "acc_norm_stderr": 0.009901067586473883 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_1.csv b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..dab907cd6f96d689c801bd0c6edc854d377e70b1 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.328,0.014853842487270334,0 +anli_r2,acc,0.327,0.014842213153411242,0 +anli_r3,acc,0.35333333333333333,0.013804572162314928,0 +arc_challenge,acc,0.2986348122866894,0.013374078615068752,0 +arc_challenge,acc_norm,0.31399317406143346,0.013562691224726297,0 +arc_easy,acc,0.601010101010101,0.010048240683798755,0 +arc_easy,acc_norm,0.5669191919191919,0.010167478013701792,0 +boolq,acc,0.6143730886850153,0.008513189460768051,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2877899877899878,,1 +copa,acc,0.76,0.04292346959909282,0 +hellaswag,acc,0.4675363473411671,0.004979252954977312,0 +hellaswag,acc_norm,0.614618601872137,0.004856906473719403,0 +piqa,acc,0.750816104461371,0.010091882770120214,0 +piqa,acc_norm,0.7633297062023939,0.009916841655042809,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.895,0.00969892102602495,0 +sciq,acc_norm,0.867,0.01074366913239733,0 +storycloze_2016,acc,0.709246392303581,0.010501233625213078,0 +winogrande,acc,0.590370955011839,0.013821049109655476,0 diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_1_lm-eval_global_step80108_2023-02-24-23-57-50_1shots_backup.json b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_1_lm-eval_global_step80108_2023-02-24-23-57-50_1shots_backup.json deleted file mode 100644 index 40f93f9ad2bb8dd4af3719bee54423537b07df94..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_1_lm-eval_global_step80108_2023-02-24-23-57-50_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.328, - "acc_stderr": 0.014853842487270334 - }, - "anli_r2": { - "acc": 0.327, - "acc_stderr": 0.014842213153411242 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314928 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2877899877899878 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909282 - }, - "hellaswag": { - "acc": 0.4675363473411671, - "acc_stderr": 0.004979252954977312, - "acc_norm": 0.614618601872137, - "acc_norm_stderr": 0.004856906473719403 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.590370955011839, - "acc_stderr": 0.013821049109655476 - }, - "storycloze_2016": { - "acc": 0.709246392303581, - "acc_stderr": 0.010501233625213078 - }, - "boolq": { - "acc": 0.6143730886850153, - "acc_stderr": 0.008513189460768051 - }, - 
"arc_easy": { - "acc": 0.601010101010101, - "acc_stderr": 0.010048240683798755, - "acc_norm": 0.5669191919191919, - "acc_norm_stderr": 0.010167478013701792 - }, - "arc_challenge": { - "acc": 0.2986348122866894, - "acc_stderr": 0.013374078615068752, - "acc_norm": 0.31399317406143346, - "acc_norm_stderr": 0.013562691224726297 - }, - "sciq": { - "acc": 0.895, - "acc_stderr": 0.00969892102602495, - "acc_norm": 0.867, - "acc_norm_stderr": 0.01074366913239733 - }, - "piqa": { - "acc": 0.750816104461371, - "acc_stderr": 0.010091882770120214, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_2.csv b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..9d5a351829a75f9daa5d2c7b265e3139ddb51063 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.332,0.014899597242811476,0 +anli_r2,acc,0.34,0.014987482264363937,0 +anli_r3,acc,0.3233333333333333,0.01350837286730022,0 +arc_challenge,acc,0.295221843003413,0.013329750293382316,0 +arc_challenge,acc_norm,0.2986348122866894,0.013374078615068744,0 +arc_easy,acc,0.6182659932659933,0.009968648851839668,0 +arc_easy,acc_norm,0.5963804713804713,0.010067368960348226,0 +boolq,acc,0.6168195718654435,0.008503021391450791,1 +cb,acc,0.35714285714285715,0.06460957383809221,1 +cb,f1,0.2989672364672365,,1 +copa,acc,0.8,0.04020151261036845,0 +hellaswag,acc,0.4649472216689902,0.004977504446608999,0 +hellaswag,acc_norm,0.6183031268671579,0.004848099661619672,0 +piqa,acc,0.7584330794341676,0.009986718001804467,0 +piqa,acc_norm,0.7698585418933623,0.009820832826839803,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.917,0.008728527206074794,0 +sciq,acc_norm,0.891,0.009859828407037188,0 +storycloze_2016,acc,0.7103153393907001,0.010489808091946617,0 +winogrande,acc,0.5974743488555643,0.013782866831703044,0 diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_2_lm-eval_global_step80108_2023-02-24-23-57-50_2shots_backup.json b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_2_lm-eval_global_step80108_2023-02-24-23-57-50_2shots_backup.json deleted file mode 100644 index ab6c14d5bf3ae85028b61141005d2f2634890ff1..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_2_lm-eval_global_step80108_2023-02-24-23-57-50_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811476 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.01350837286730022 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.2989672364672365 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036845 - }, - "hellaswag": { - "acc": 0.4649472216689902, - "acc_stderr": 0.004977504446608999, - "acc_norm": 0.6183031268671579, - "acc_norm_stderr": 0.004848099661619672 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5974743488555643, - 
"acc_stderr": 0.013782866831703044 - }, - "storycloze_2016": { - "acc": 0.7103153393907001, - "acc_stderr": 0.010489808091946617 - }, - "boolq": { - "acc": 0.6168195718654435, - "acc_stderr": 0.008503021391450791 - }, - "arc_easy": { - "acc": 0.6182659932659933, - "acc_stderr": 0.009968648851839668, - "acc_norm": 0.5963804713804713, - "acc_norm_stderr": 0.010067368960348226 - }, - "arc_challenge": { - "acc": 0.295221843003413, - "acc_stderr": 0.013329750293382316, - "acc_norm": 0.2986348122866894, - "acc_norm_stderr": 0.013374078615068744 - }, - "sciq": { - "acc": 0.917, - "acc_stderr": 0.008728527206074794, - "acc_norm": 0.891, - "acc_norm_stderr": 0.009859828407037188 - }, - "piqa": { - "acc": 0.7584330794341676, - "acc_stderr": 0.009986718001804467, - "acc_norm": 0.7698585418933623, - "acc_norm_stderr": 0.009820832826839803 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_3.csv b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..ca2995e3e6a21f1d4ff166ba1715aa3afbb131a8 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.014806864733738859,0 +anli_r2,acc,0.331,0.014888272588203933,0 +anli_r3,acc,0.32083333333333336,0.013480882752851553,0 +arc_challenge,acc,0.30716723549488056,0.013481034054980945,0 +arc_challenge,acc_norm,0.30802047781569963,0.013491429517292038,0 +arc_easy,acc,0.6241582491582491,0.00993843637317063,0 +arc_easy,acc_norm,0.5997474747474747,0.010053550119896133,0 +boolq,acc,0.6097859327217126,0.008531643526263245,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.34034722536464695,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.46863174666401114,0.004979952166595543,0 +hellaswag,acc_norm,0.6194981079466242,0.00484518003427162,0 +piqa,acc,0.7589771490750816,0.009979042717267314,0 +piqa,acc_norm,0.7595212187159956,0.009971345364651071,0 +rte,acc,0.5451263537906137,0.02997363649541526,0 +sciq,acc,0.914,0.008870325962594766,0 +sciq,acc_norm,0.894,0.009739551265785138,0 +storycloze_2016,acc,0.7129877071084981,0.010460934115933265,0 +winogrande,acc,0.5872138910812944,0.013837060648682089,0 diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_3_lm-eval_global_step80108_2023-02-24-23-57-50_3shots_backup.json b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_3_lm-eval_global_step80108_2023-02-24-23-57-50_3shots_backup.json deleted file mode 100644 index efe45f149b86420a680fc18b5137af01ec77d8b2..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_3_lm-eval_global_step80108_2023-02-24-23-57-50_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.014806864733738859 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.014888272588203933 - }, - "anli_r3": { - "acc": 0.32083333333333336, - "acc_stderr": 0.013480882752851553 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.34034722536464695 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.46863174666401114, - "acc_stderr": 0.004979952166595543, - 
"acc_norm": 0.6194981079466242, - "acc_norm_stderr": 0.00484518003427162 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.02997363649541526 - }, - "winogrande": { - "acc": 0.5872138910812944, - "acc_stderr": 0.013837060648682089 - }, - "storycloze_2016": { - "acc": 0.7129877071084981, - "acc_stderr": 0.010460934115933265 - }, - "boolq": { - "acc": 0.6097859327217126, - "acc_stderr": 0.008531643526263245 - }, - "arc_easy": { - "acc": 0.6241582491582491, - "acc_stderr": 0.00993843637317063, - "acc_norm": 0.5997474747474747, - "acc_norm_stderr": 0.010053550119896133 - }, - "arc_challenge": { - "acc": 0.30716723549488056, - "acc_stderr": 0.013481034054980945, - "acc_norm": 0.30802047781569963, - "acc_norm_stderr": 0.013491429517292038 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.894, - "acc_norm_stderr": 0.009739551265785138 - }, - "piqa": { - "acc": 0.7589771490750816, - "acc_stderr": 0.009979042717267314, - "acc_norm": 0.7595212187159956, - "acc_norm_stderr": 0.009971345364651071 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_4.csv b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..45d7f1bbefdf53c04e51bf56cc9df47c9bdbf926 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795023,0 +anli_r2,acc,0.32,0.014758652303574876,0 +anli_r3,acc,0.3491666666666667,0.013767075395077249,0 +arc_challenge,acc,0.29692832764505117,0.013352025976725222,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053057,0 +arc_easy,acc,0.6287878787878788,0.00991359900184574,0 +arc_easy,acc_norm,0.6031144781144782,0.010039236800583199,0 +boolq,acc,0.6165137614678899,0.008504304838837023,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.4400465860102907,,1 +copa,acc,0.81,0.03942772444036622,0 +hellaswag,acc,0.46873132842063336,0.004980014536539821,0 +hellaswag,acc_norm,0.6197968532164907,0.004844445265582643,0 +piqa,acc,0.7551686615886833,0.010032309105568795,0 +piqa,acc_norm,0.7622415669205659,0.009932525779525489,0 +rte,acc,0.555956678700361,0.029907396333795983,0 +sciq,acc,0.908,0.009144376393151105,0 +sciq,acc_norm,0.905,0.009276910103103324,0 +storycloze_2016,acc,0.7145911277391769,0.010443395884062118,0 +winogrande,acc,0.579321231254933,0.013874526372008315,0 diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_4_lm-eval_global_step80108_2023-02-24-23-57-43_4shots_backup.json b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_4_lm-eval_global_step80108_2023-02-24-23-57-43_4shots_backup.json deleted file mode 100644 index 3dea9a2ae8b3ce9e4621fedbcf026ac258ad8bba..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_4_lm-eval_global_step80108_2023-02-24-23-57-43_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795023 - }, - "anli_r2": { - "acc": 0.32, - "acc_stderr": 0.014758652303574876 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077249 - }, - "cb": { - "acc": 0.4642857142857143, - 
"acc_stderr": 0.0672477765493766, - "f1": 0.4400465860102907 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036622 - }, - "hellaswag": { - "acc": 0.46873132842063336, - "acc_stderr": 0.004980014536539821, - "acc_norm": 0.6197968532164907, - "acc_norm_stderr": 0.004844445265582643 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795983 - }, - "winogrande": { - "acc": 0.579321231254933, - "acc_stderr": 0.013874526372008315 - }, - "storycloze_2016": { - "acc": 0.7145911277391769, - "acc_stderr": 0.010443395884062118 - }, - "boolq": { - "acc": 0.6165137614678899, - "acc_stderr": 0.008504304838837023 - }, - "arc_easy": { - "acc": 0.6287878787878788, - "acc_stderr": 0.00991359900184574, - "acc_norm": 0.6031144781144782, - "acc_norm_stderr": 0.010039236800583199 - }, - "arc_challenge": { - "acc": 0.29692832764505117, - "acc_stderr": 0.013352025976725222, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053057 - }, - "sciq": { - "acc": 0.908, - "acc_stderr": 0.009144376393151105, - "acc_norm": 0.905, - "acc_norm_stderr": 0.009276910103103324 - }, - "piqa": { - "acc": 0.7551686615886833, - "acc_stderr": 0.010032309105568795, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525489 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_5.csv b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..21ec9254a071c6bfd66ca44f6b511c106ebb9d76 --- /dev/null +++ b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.01486539538592836,0 +anli_r2,acc,0.33,0.014876872027456736,0 +anli_r3,acc,0.33416666666666667,0.013622434813136778,0 +arc_challenge,acc,0.2977815699658703,0.013363080107244487,0 +arc_challenge,acc_norm,0.31399317406143346,0.013562691224726295,0 +arc_easy,acc,0.6296296296296297,0.009908978578665758,0 +arc_easy,acc_norm,0.6119528619528619,0.009999295905750669,0 +boolq,acc,0.6159021406727829,0.008506861063860251,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.3130523153057618,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.4673371838279227,0.004979123236507975,0 +hellaswag,acc_norm,0.6237801234813782,0.0048344619979448795,0 +piqa,acc,0.7486398258977149,0.01012115601681926,0 +piqa,acc_norm,0.7611534276387377,0.009948120385337485,0 +rte,acc,0.5270758122743683,0.030052303463143706,0 +sciq,acc,0.918,0.008680515615523727,0 +sciq,acc_norm,0.912,0.008963053962592076,0 +storycloze_2016,acc,0.7183324425440941,0.010401844358587665,0 +winogrande,acc,0.584846093133386,0.013848684086658587,0 diff --git a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_5_lm-eval_global_step80108_2023-02-25-00-00-35_5shots_backup.json b/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_5_lm-eval_global_step80108_2023-02-25-00-00-35_5shots_backup.json deleted file mode 100644 index 64b0f8301c8c461b66852fd02cb72f95fb2b5744..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed3/evaluation/rankeval/4b284b17bc4seed3_5_lm-eval_global_step80108_2023-02-25-00-00-35_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 
0.01486539538592836 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.014876872027456736 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.013622434813136778 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.3130523153057618 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.4673371838279227, - "acc_stderr": 0.004979123236507975, - "acc_norm": 0.6237801234813782, - "acc_norm_stderr": 0.0048344619979448795 - }, - "rte": { - "acc": 0.5270758122743683, - "acc_stderr": 0.030052303463143706 - }, - "winogrande": { - "acc": 0.584846093133386, - "acc_stderr": 0.013848684086658587 - }, - "storycloze_2016": { - "acc": 0.7183324425440941, - "acc_stderr": 0.010401844358587665 - }, - "boolq": { - "acc": 0.6159021406727829, - "acc_stderr": 0.008506861063860251 - }, - "arc_easy": { - "acc": 0.6296296296296297, - "acc_stderr": 0.009908978578665758, - "acc_norm": 0.6119528619528619, - "acc_norm_stderr": 0.009999295905750669 - }, - "arc_challenge": { - "acc": 0.2977815699658703, - "acc_stderr": 0.013363080107244487, - "acc_norm": 0.31399317406143346, - "acc_norm_stderr": 0.013562691224726295 - }, - "sciq": { - "acc": 0.918, - "acc_stderr": 0.008680515615523727, - "acc_norm": 0.912, - "acc_norm_stderr": 0.008963053962592076 - }, - "piqa": { - "acc": 0.7486398258977149, - "acc_stderr": 0.01012115601681926, - "acc_norm": 0.7611534276387377, - "acc_norm_stderr": 0.009948120385337485 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/generation/merged.csv b/4b284b17bc4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..979511fb272cf5fe5aaf2c49fe34dbc30c7ae558 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.03966758182207007 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.03966758182207007 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.12345720450902517 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.12345720450902517 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.14193662867850868 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.14193662867850868 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.150464882275765 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.150464882275765 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.15877376090043754 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.15877376090043754 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.17510857724490006 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.17510857724490006 +e2e_nlg_cleaned,5,average,multiple,0.13156810590511775 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04827458539234547 +gem_xsum,0,median,rouge2_fmeasure,0.04827458539234547 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03467713914457821 +gem_xsum,1,median,rouge2_fmeasure,0.03467713914457821 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03500072349690542 +gem_xsum,2,median,rouge2_fmeasure,0.03500072349690542 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.032789849256510996 +gem_xsum,3,median,rouge2_fmeasure,0.032789849256510996 
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.007733619159412637 +gem_xsum,4,median,rouge2_fmeasure,0.007733619159412637 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001356086261746639 +gem_xsum,5,median,rouge2_fmeasure,0.0001356086261746639 +gem_xsum,5,average,multiple,0.026435254179321233 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05407500536237694 +web_nlg_en,0,median,rouge2_fmeasure,0.05407500536237694 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05262981616900198 +web_nlg_en,1,median,rouge2_fmeasure,0.05262981616900198 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05158386270522433 +web_nlg_en,2,median,rouge2_fmeasure,0.05158386270522433 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05181002178423635 +web_nlg_en,3,median,rouge2_fmeasure,0.05181002178423635 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05268512410118113 +web_nlg_en,4,median,rouge2_fmeasure,0.05268512410118113 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05377047978614227 +web_nlg_en,5,median,rouge2_fmeasure,0.05377047978614227 +web_nlg_en,5,average,multiple,0.0527590516513605 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03501787241163656 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03501787241163656 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.048065876938400164 +wiki_lingua_en,1,median,rouge2_fmeasure,0.048065876938400164 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.052858956129486996 +wiki_lingua_en,2,median,rouge2_fmeasure,0.052858956129486996 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.044609780615911226 +wiki_lingua_en,3,median,rouge2_fmeasure,0.044609780615911226 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014341711428955552 +wiki_lingua_en,4,median,rouge2_fmeasure,0.014341711428955552 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002524397418563037 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002524397418563037 +wiki_lingua_en,5,average,multiple,0.03290309915715892 diff --git a/4b284b17bc4seed4/evaluation/generation/merged.json b/4b284b17bc4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..94b675b20abc98e7e0a569bd26ef14314b8417a7 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4343824765662211, "bleu_stderr": 0.048023006630861446, "rouge1_fmeasure": 0.11266211771001643, "rouge1_fmeasure_stderr": 0.002202104984798198, "rouge1_precision": 0.07515044953734244, "rouge1_precision_stderr": 0.001774290416077829, "rouge1_recall": 0.3050407212838325, "rouge1_recall_stderr": 0.004655551309188598, "rouge2_fmeasure": 0.05407500536237694, "rouge2_fmeasure_stderr": 0.0013873756941219296, "rouge2_precision": 0.03559188393215755, "rouge2_precision_stderr": 0.0010374432747792411, "rouge2_recall": 0.1498962433071719, "rouge2_recall_stderr": 0.0032087777030037298, "rougeL_fmeasure": 0.10804790651080123, "rougeL_fmeasure_stderr": 0.002009937031231657, "rougeL_precision": 0.07169820415924186, "rougeL_precision_stderr": 0.0016045223290318616, "rougeL_recall": 0.29638499524848283, "rougeL_recall_stderr": 0.004517325443303672, "rougeLsum_fmeasure": 0.10688384052591915, "rougeLsum_fmeasure_stderr": 0.0020374002214165765, "rougeLsum_precision": 0.07125637469489439, "rougeLsum_precision_stderr": 0.0016567827741380907, "rougeLsum_recall": 0.29050100506882576, "rougeLsum_recall_stderr": 0.004354982180732837}}, "1": {"PALM_prompt": {"bleu": 0.4777652439857506, "bleu_stderr": 0.04289643324498187, "rouge1_fmeasure": 0.11350112011176325, "rouge1_fmeasure_stderr": 0.0019517961113306424, 
"rouge1_precision": 0.07428879200903636, "rouge1_precision_stderr": 0.0016536820931879412, "rouge1_recall": 0.359039540510727, "rouge1_recall_stderr": 0.0052229877731486555, "rouge2_fmeasure": 0.05262981616900198, "rouge2_fmeasure_stderr": 0.0012373094619438755, "rouge2_precision": 0.03452382449511785, "rouge2_precision_stderr": 0.0010807787987775395, "rouge2_recall": 0.17163126837814455, "rouge2_recall_stderr": 0.003523343921743217, "rougeL_fmeasure": 0.10617997776780644, "rougeL_fmeasure_stderr": 0.0017768148865928235, "rougeL_precision": 0.0695700878815242, "rougeL_precision_stderr": 0.0015404785622954054, "rougeL_recall": 0.3338436470745751, "rougeL_recall_stderr": 0.004674116193141606, "rougeLsum_fmeasure": 0.10771127727776171, "rougeLsum_fmeasure_stderr": 0.0018405145485082116, "rougeLsum_precision": 0.07064523035468624, "rougeLsum_precision_stderr": 0.001589046589323449, "rougeLsum_recall": 0.3389141371359368, "rougeLsum_recall_stderr": 0.004777787712428965}}, "2": {"PALM_prompt": {"bleu": 0.47073912853002836, "bleu_stderr": 0.023799891455588994, "rouge1_fmeasure": 0.11243445427080336, "rouge1_fmeasure_stderr": 0.0018371970134705366, "rouge1_precision": 0.07183113519874683, "rouge1_precision_stderr": 0.0014196899567873438, "rouge1_recall": 0.37235831339776915, "rouge1_recall_stderr": 0.005151206235548431, "rouge2_fmeasure": 0.05158386270522433, "rouge2_fmeasure_stderr": 0.0011670568217336348, "rouge2_precision": 0.0330272979137574, "rouge2_precision_stderr": 0.0009579517245715237, "rouge2_recall": 0.17886825082331345, "rouge2_recall_stderr": 0.0036331919653676283, "rougeL_fmeasure": 0.10474516482369794, "rougeL_fmeasure_stderr": 0.001692192936065937, "rougeL_precision": 0.0670145094838325, "rougeL_precision_stderr": 0.0013219857603462978, "rougeL_recall": 0.34265516820112996, "rougeL_recall_stderr": 0.004516253867115107, "rougeLsum_fmeasure": 0.10671549790477462, "rougeLsum_fmeasure_stderr": 0.0017441751913910685, "rougeLsum_precision": 0.068270126227838, "rougeLsum_precision_stderr": 0.0013605465670814216, "rougeLsum_recall": 0.3510804564310299, "rougeLsum_recall_stderr": 0.0047143969629961525}}, "3": {"PALM_prompt": {"bleu": 0.5422338874786361, "bleu_stderr": 0.03912554607334545, "rouge1_fmeasure": 0.11349829995272724, "rouge1_fmeasure_stderr": 0.0017634307311235771, "rouge1_precision": 0.07194937398557512, "rouge1_precision_stderr": 0.0012902741156138255, "rouge1_recall": 0.3840439334292797, "rouge1_recall_stderr": 0.0052401116627794715, "rouge2_fmeasure": 0.05181002178423635, "rouge2_fmeasure_stderr": 0.001103265114709934, "rouge2_precision": 0.03270111999859864, "rouge2_precision_stderr": 0.000776043898940337, "rouge2_recall": 0.18536622576620337, "rouge2_recall_stderr": 0.0036796423541916546, "rougeL_fmeasure": 0.1044700461584475, "rougeL_fmeasure_stderr": 0.0016024512570445466, "rougeL_precision": 0.06633681574964742, "rougeL_precision_stderr": 0.001175904758712672, "rougeL_recall": 0.350092838937646, "rougeL_recall_stderr": 0.0045896219703288, "rougeLsum_fmeasure": 0.10726040185593314, "rougeLsum_fmeasure_stderr": 0.0016573279996467407, "rougeLsum_precision": 0.06805768430915979, "rougeLsum_precision_stderr": 0.00121688147385173, "rougeLsum_recall": 0.3613402397970161, "rougeLsum_recall_stderr": 0.004798952558355593}}, "4": {"PALM_prompt": {"bleu": 0.556733317409903, "bleu_stderr": 0.03352390753515493, "rouge1_fmeasure": 0.11456006324772545, "rouge1_fmeasure_stderr": 0.001763820816857359, "rouge1_precision": 0.07284301393690339, "rouge1_precision_stderr": 
0.001351281743808941, "rouge1_recall": 0.3854754221896833, "rouge1_recall_stderr": 0.005187297637050295, "rouge2_fmeasure": 0.05268512410118113, "rouge2_fmeasure_stderr": 0.0011172770869965355, "rouge2_precision": 0.03336045659078967, "rouge2_precision_stderr": 0.0008224910529654114, "rouge2_recall": 0.18881325248468273, "rouge2_recall_stderr": 0.0037247751931321566, "rougeL_fmeasure": 0.10533650941563254, "rougeL_fmeasure_stderr": 0.0016013938924540362, "rougeL_precision": 0.06700704392877227, "rougeL_precision_stderr": 0.0012140169040490646, "rougeL_recall": 0.35210849890589585, "rougeL_recall_stderr": 0.004553998796356672, "rougeLsum_fmeasure": 0.1088080335927791, "rougeLsum_fmeasure_stderr": 0.0016684964726357495, "rougeLsum_precision": 0.06918827783916483, "rougeLsum_precision_stderr": 0.0012613399060745452, "rougeLsum_recall": 0.3647320579652569, "rougeLsum_recall_stderr": 0.00478832910750132}}, "5": {"PALM_prompt": {"bleu": 0.6308094101290962, "bleu_stderr": 0.03843181024011274, "rouge1_fmeasure": 0.11566166139252718, "rouge1_fmeasure_stderr": 0.0017379806910797118, "rouge1_precision": 0.0728360785509602, "rouge1_precision_stderr": 0.0012734622852817754, "rouge1_recall": 0.39652294777439107, "rouge1_recall_stderr": 0.005097383629183161, "rouge2_fmeasure": 0.05377047978614227, "rouge2_fmeasure_stderr": 0.0011161518281830932, "rouge2_precision": 0.03377070741483298, "rouge2_precision_stderr": 0.0007898553252946896, "rouge2_recall": 0.1953296193452289, "rouge2_recall_stderr": 0.003725576283521446, "rougeL_fmeasure": 0.10536121627153393, "rougeL_fmeasure_stderr": 0.0015570400795540103, "rougeL_precision": 0.06642237564216569, "rougeL_precision_stderr": 0.0011436420165114316, "rougeL_recall": 0.35936417373341556, "rougeL_recall_stderr": 0.004447613020324495, "rougeLsum_fmeasure": 0.10926286569235492, "rougeLsum_fmeasure_stderr": 0.001640540334412943, "rougeLsum_precision": 0.06888580838305514, "rougeLsum_precision_stderr": 0.001205708611649529, "rougeLsum_recall": 0.3734033598065731, "rougeLsum_recall_stderr": 0.004706921267157343}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4896169854663501, "bleu_stderr": 0.07212072603448405, "rouge1_fmeasure": 0.17643710197803295, "rouge1_fmeasure_stderr": 0.0018604910710955252, "rouge1_precision": 0.15099140781501352, "rouge1_precision_stderr": 0.0019180779262540281, "rouge1_recall": 0.25511321178878194, "rouge1_recall_stderr": 0.002620094969621615, "rouge2_fmeasure": 0.03501787241163656, "rouge2_fmeasure_stderr": 0.0008327348773189318, "rouge2_precision": 0.02976173095153942, "rouge2_precision_stderr": 0.0007456489774853847, "rouge2_recall": 0.05253190898767673, "rouge2_recall_stderr": 0.0013933280275662638, "rougeL_fmeasure": 0.1378946028683743, "rougeL_fmeasure_stderr": 0.0013188502745522404, "rougeL_precision": 0.11641644539555117, "rougeL_precision_stderr": 0.001323494973193759, "rougeL_recall": 0.20461791608644445, "rougeL_recall_stderr": 0.0021348938579779713, "rougeLsum_fmeasure": 0.16062076106529133, "rougeLsum_fmeasure_stderr": 0.0016845987689844018, "rougeLsum_precision": 0.13733969787654335, "rougeLsum_precision_stderr": 0.0017358642947116032, "rougeLsum_recall": 0.23304617406159703, "rougeLsum_recall_stderr": 0.0024190451662485664}}, "1": {"tldr_en": {"bleu": 2.3894615503024323, "bleu_stderr": 0.06415543385241808, "rouge1_fmeasure": 0.2059850704398306, "rouge1_fmeasure_stderr": 0.001963094074210792, "rouge1_precision": 0.1784855985111314, "rouge1_precision_stderr": 0.0021104967475198393, "rouge1_recall": 0.2977670406081463, 
"rouge1_recall_stderr": 0.0028401853473604423, "rouge2_fmeasure": 0.048065876938400164, "rouge2_fmeasure_stderr": 0.000986652711858889, "rouge2_precision": 0.041397748410955955, "rouge2_precision_stderr": 0.0009208602859554342, "rouge2_recall": 0.07237160586135134, "rouge2_recall_stderr": 0.0016438056937175594, "rougeL_fmeasure": 0.14967719429446022, "rougeL_fmeasure_stderr": 0.0013239741772559975, "rougeL_precision": 0.1283942054903385, "rougeL_precision_stderr": 0.001420523305002391, "rougeL_recall": 0.22236212852683934, "rougeL_recall_stderr": 0.0022511017655621394, "rougeLsum_fmeasure": 0.19238269605170216, "rougeLsum_fmeasure_stderr": 0.0018324753675528188, "rougeLsum_precision": 0.1664591749851555, "rougeLsum_precision_stderr": 0.001961368809410266, "rougeLsum_recall": 0.2792717283183132, "rougeLsum_recall_stderr": 0.002704948720241127}}, "2": {"tldr_en": {"bleu": 2.802338204417689, "bleu_stderr": 0.06864786924899859, "rouge1_fmeasure": 0.21454371863359756, "rouge1_fmeasure_stderr": 0.001925599546701441, "rouge1_precision": 0.1897032630760182, "rouge1_precision_stderr": 0.0022368455029635258, "rouge1_recall": 0.3066928146535342, "rouge1_recall_stderr": 0.002802643642933985, "rouge2_fmeasure": 0.052858956129486996, "rouge2_fmeasure_stderr": 0.0010490961352643794, "rouge2_precision": 0.04731501274354434, "rouge2_precision_stderr": 0.0011062723970647515, "rouge2_recall": 0.07790969048229124, "rouge2_recall_stderr": 0.0016999896420780702, "rougeL_fmeasure": 0.1561270245197477, "rougeL_fmeasure_stderr": 0.0013458695850914661, "rougeL_precision": 0.1371759400598415, "rougeL_precision_stderr": 0.001604972782557924, "rougeL_recall": 0.22832634865694745, "rougeL_recall_stderr": 0.002243861215065516, "rougeLsum_fmeasure": 0.20112352002530945, "rougeLsum_fmeasure_stderr": 0.0018023852270730636, "rougeLsum_precision": 0.17773338080575501, "rougeLsum_precision_stderr": 0.0021042652190900865, "rougeLsum_recall": 0.28819050073062974, "rougeLsum_recall_stderr": 0.002659179999974621}}, "3": {"tldr_en": {"bleu": 2.885675275089151, "bleu_stderr": 0.05882864011318359, "rouge1_fmeasure": 0.17866329514940266, "rouge1_fmeasure_stderr": 0.002248902913307235, "rouge1_precision": 0.164791315350049, "rouge1_precision_stderr": 0.002544569650584157, "rouge1_recall": 0.25459613014501403, "rouge1_recall_stderr": 0.003306705101721616, "rouge2_fmeasure": 0.044609780615911226, "rouge2_fmeasure_stderr": 0.0010228881021056443, "rouge2_precision": 0.04076673951979605, "rouge2_precision_stderr": 0.0010836340049991495, "rouge2_recall": 0.06599509861885819, "rouge2_recall_stderr": 0.0016626651006712032, "rougeL_fmeasure": 0.13081554608705287, "rougeL_fmeasure_stderr": 0.0016224247816486106, "rougeL_precision": 0.12069704799979292, "rougeL_precision_stderr": 0.0019208739461812644, "rougeL_recall": 0.19074830444178492, "rougeL_recall_stderr": 0.0026182630481534513, "rougeLsum_fmeasure": 0.16737756927690062, "rougeLsum_fmeasure_stderr": 0.002110717879040218, "rougeLsum_precision": 0.15436431494976136, "rougeLsum_precision_stderr": 0.0023979790621534345, "rougeLsum_recall": 0.23936051366475677, "rougeLsum_recall_stderr": 0.003153507229159192}}, "4": {"tldr_en": {"bleu": 0.6376672457997873, "bleu_stderr": 0.04637276744359661, "rouge1_fmeasure": 0.05799142262407675, "rouge1_fmeasure_stderr": 0.0019737114395287285, "rouge1_precision": 0.05507100416731384, "rouge1_precision_stderr": 0.002085930904803074, "rouge1_recall": 0.0852505791667054, "rouge1_recall_stderr": 0.0029574668511788207, "rouge2_fmeasure": 0.014341711428955552, 
"rouge2_fmeasure_stderr": 0.0007033787935002822, "rouge2_precision": 0.01356541169159794, "rouge2_precision_stderr": 0.0007797331675838104, "rouge2_recall": 0.02208226811346249, "rouge2_recall_stderr": 0.0011738851444263426, "rougeL_fmeasure": 0.04342836324139622, "rougeL_fmeasure_stderr": 0.0014660653045381622, "rougeL_precision": 0.04131835587423735, "rougeL_precision_stderr": 0.0015840597026356054, "rougeL_recall": 0.06520383358681357, "rougeL_recall_stderr": 0.0023156128280409144, "rougeLsum_fmeasure": 0.05385535349179482, "rougeLsum_fmeasure_stderr": 0.0018338162657545842, "rougeLsum_precision": 0.05130679646224984, "rougeLsum_precision_stderr": 0.0019569857737155353, "rougeLsum_recall": 0.07917622369225202, "rougeLsum_recall_stderr": 0.0027610727268320553}}, "5": {"tldr_en": {"bleu": 8.208243477346069e-07, "bleu_stderr": 1.678978878468568e-06, "rouge1_fmeasure": 0.009395603794958232, "rouge1_fmeasure_stderr": 0.0008882674507780892, "rouge1_precision": 0.009287554139206585, "rouge1_precision_stderr": 0.0009771031310070455, "rouge1_recall": 0.014121586207336037, "rouge1_recall_stderr": 0.0013749087975557977, "rouge2_fmeasure": 0.002524397418563037, "rouge2_fmeasure_stderr": 0.0003096322407869578, "rouge2_precision": 0.0024832360926263118, "rouge2_precision_stderr": 0.0004279391961552875, "rouge2_recall": 0.004145687467440537, "rouge2_recall_stderr": 0.0005559391301711729, "rougeL_fmeasure": 0.0069394731437362024, "rougeL_fmeasure_stderr": 0.0006530086772293193, "rougeL_precision": 0.007001785329480834, "rougeL_precision_stderr": 0.0007757287042749915, "rougeL_recall": 0.010784174279067018, "rougeL_recall_stderr": 0.0010877434670126132, "rougeLsum_fmeasure": 0.008756530255363647, "rougeLsum_fmeasure_stderr": 0.0008258296713068699, "rougeLsum_precision": 0.008663341441816887, "rougeLsum_precision_stderr": 0.0009176564706978443, "rougeLsum_recall": 0.013318862828132983, "rougeLsum_recall_stderr": 0.001308577095420894}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.2912180823667536, "bleu_stderr": 0.04696171018074308, "rouge1_fmeasure": 0.13196601070831873, "rouge1_fmeasure_stderr": 0.0019052440963938729, "rouge1_precision": 0.12047198117053222, "rouge1_precision_stderr": 0.002422706752112519, "rouge1_recall": 0.18343958014854436, "rouge1_recall_stderr": 0.0025075189521513584, "rouge2_fmeasure": 0.03966758182207007, "rouge2_fmeasure_stderr": 0.0009922097782735988, "rouge2_precision": 0.03529024774431325, "rouge2_precision_stderr": 0.0010291954001165721, "rouge2_recall": 0.05621572365217293, "rouge2_recall_stderr": 0.0014393304564782369, "rougeL_fmeasure": 0.12747049004214134, "rougeL_fmeasure_stderr": 0.0017889313386553027, "rougeL_precision": 0.11445891994934188, "rougeL_precision_stderr": 0.002171944159936819, "rougeL_recall": 0.17892248577238187, "rougeL_recall_stderr": 0.0024322208362124683, "rougeLsum_fmeasure": 0.11526688124292425, "rougeLsum_fmeasure_stderr": 0.0016500754447542722, "rougeLsum_precision": 0.1051585199077769, "rougeLsum_precision_stderr": 0.002123665531923284, "rougeLsum_recall": 0.16102718478893413, "rougeLsum_recall_stderr": 0.002187988253994963}}, "1": {"generate_text_restaurant": {"bleu": 5.158705932289786, "bleu_stderr": 0.06703304610399921, "rouge1_fmeasure": 0.29675375590537084, "rouge1_fmeasure_stderr": 0.0018062329364285303, "rouge1_precision": 0.22812040284605375, "rouge1_precision_stderr": 0.0016480972989796332, "rouge1_recall": 0.4578694253287623, "rouge1_recall_stderr": 0.0028531834777872776, "rouge2_fmeasure": 0.12345720450902517, 
"rouge2_fmeasure_stderr": 0.0012773505784629316, "rouge2_precision": 0.09439151368502757, "rouge2_precision_stderr": 0.001087562485720737, "rouge2_recall": 0.1947739254358195, "rouge2_recall_stderr": 0.0020983599381460523, "rougeL_fmeasure": 0.2444332716499778, "rougeL_fmeasure_stderr": 0.001401848033777242, "rougeL_precision": 0.18717327314696164, "rougeL_precision_stderr": 0.0012557263702334491, "rougeL_recall": 0.3801282135359848, "rougeL_recall_stderr": 0.002436732925310422, "rougeLsum_fmeasure": 0.24007972105838774, "rougeLsum_fmeasure_stderr": 0.0017011987761281766, "rougeLsum_precision": 0.18467703038593095, "rougeLsum_precision_stderr": 0.001513831526446134, "rougeLsum_recall": 0.37053817605298633, "rougeLsum_recall_stderr": 0.002685039659209729}}, "2": {"generate_text_restaurant": {"bleu": 5.690375469842014, "bleu_stderr": 0.0824383936230684, "rouge1_fmeasure": 0.3119156609058321, "rouge1_fmeasure_stderr": 0.001824131658302162, "rouge1_precision": 0.24398832908607115, "rouge1_precision_stderr": 0.0017150907949842408, "rouge1_recall": 0.4646895811049601, "rouge1_recall_stderr": 0.002858271661635396, "rouge2_fmeasure": 0.14193662867850868, "rouge2_fmeasure_stderr": 0.001353113196686301, "rouge2_precision": 0.1104767425139252, "rouge2_precision_stderr": 0.00118939445573815, "rouge2_recall": 0.21581287138558555, "rouge2_recall_stderr": 0.0021618573233875907, "rougeL_fmeasure": 0.24430052562824675, "rougeL_fmeasure_stderr": 0.0015011602139692235, "rougeL_precision": 0.190435504439295, "rougeL_precision_stderr": 0.0013620859984245742, "rougeL_recall": 0.3666980844771318, "rougeL_recall_stderr": 0.0025349587698896693, "rougeLsum_fmeasure": 0.2589305200172953, "rougeLsum_fmeasure_stderr": 0.0017296796631600313, "rougeLsum_precision": 0.20272516091446563, "rougeLsum_precision_stderr": 0.00158561733134627, "rougeLsum_recall": 0.3854917658051633, "rougeLsum_recall_stderr": 0.0026938058133417295}}, "3": {"generate_text_restaurant": {"bleu": 6.0439736641102115, "bleu_stderr": 0.07379602017794676, "rouge1_fmeasure": 0.31661697205997225, "rouge1_fmeasure_stderr": 0.0019286637244208624, "rouge1_precision": 0.2593071001052398, "rouge1_precision_stderr": 0.0022412256780430496, "rouge1_recall": 0.455064264350734, "rouge1_recall_stderr": 0.0028624784422107666, "rouge2_fmeasure": 0.150464882275765, "rouge2_fmeasure_stderr": 0.0014638819250874162, "rouge2_precision": 0.12300741763661184, "rouge2_precision_stderr": 0.0014855909298141189, "rouge2_recall": 0.2199514348870645, "rouge2_recall_stderr": 0.002208467149338667, "rougeL_fmeasure": 0.2455820937995388, "rougeL_fmeasure_stderr": 0.0015680944117292498, "rougeL_precision": 0.20036090144703264, "rougeL_precision_stderr": 0.0017495653457872179, "rougeL_recall": 0.3556053585061198, "rougeL_recall_stderr": 0.0025198458005494056, "rougeLsum_fmeasure": 0.26559516160958013, "rougeLsum_fmeasure_stderr": 0.0018357436220606, "rougeLsum_precision": 0.2176291277013681, "rougeLsum_precision_stderr": 0.002020385835890593, "rougeLsum_recall": 0.38138413858584713, "rougeLsum_recall_stderr": 0.0027063149785747386}}, "4": {"generate_text_restaurant": {"bleu": 6.354881393754141, "bleu_stderr": 0.09890740197695579, "rouge1_fmeasure": 0.32913790714361146, "rouge1_fmeasure_stderr": 0.0021594310382399657, "rouge1_precision": 0.29203333682579574, "rouge1_precision_stderr": 0.002999280746128511, "rouge1_recall": 0.4413570813281445, "rouge1_recall_stderr": 0.0027581195981604377, "rouge2_fmeasure": 0.15877376090043754, "rouge2_fmeasure_stderr": 0.0016333690903858943, 
"rouge2_precision": 0.1417297506283129, "rouge2_precision_stderr": 0.0019738294219546937, "rouge2_recall": 0.21519554056054305, "rouge2_recall_stderr": 0.0021602256883732117, "rougeL_fmeasure": 0.2534749321081447, "rougeL_fmeasure_stderr": 0.001774545693280727, "rougeL_precision": 0.22425926194809623, "rougeL_precision_stderr": 0.002386386479402504, "rougeL_recall": 0.3424977004572995, "rougeL_recall_stderr": 0.002467779605028914, "rougeLsum_fmeasure": 0.27689613522254997, "rougeLsum_fmeasure_stderr": 0.002039497240519239, "rougeLsum_precision": 0.24618983515410067, "rougeLsum_precision_stderr": 0.0027027856125378625, "rougeLsum_recall": 0.370614829709113, "rougeLsum_recall_stderr": 0.0026083449397794707}}, "5": {"generate_text_restaurant": {"bleu": 7.1909177856911, "bleu_stderr": 0.10265642450958426, "rouge1_fmeasure": 0.35823495309511205, "rouge1_fmeasure_stderr": 0.002434641300267102, "rouge1_precision": 0.35432909163852516, "rouge1_precision_stderr": 0.0037045739601193023, "rouge1_recall": 0.4323465797148469, "rouge1_recall_stderr": 0.0027540185485072804, "rouge2_fmeasure": 0.17510857724490006, "rouge2_fmeasure_stderr": 0.0018284980975542188, "rouge2_precision": 0.1751713501618444, "rouge2_precision_stderr": 0.002455924974836673, "rouge2_recall": 0.2128138263727067, "rouge2_recall_stderr": 0.0021369605527865143, "rougeL_fmeasure": 0.2721424516768009, "rougeL_fmeasure_stderr": 0.002009075407458165, "rougeL_precision": 0.26880238843108867, "rougeL_precision_stderr": 0.002974234215185799, "rougeL_recall": 0.3307966526066714, "rougeL_recall_stderr": 0.0024430103047921436, "rougeLsum_fmeasure": 0.30104710211900604, "rougeLsum_fmeasure_stderr": 0.002271337955877024, "rougeLsum_precision": 0.2977676212670809, "rougeLsum_precision_stderr": 0.0033011944868462984, "rougeLsum_recall": 0.36361464138817934, "rougeLsum_recall_stderr": 0.0026219890555875887}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.11113784441646, "bleu_stderr": 0.10616099303569221, "rouge1_fmeasure": 0.20872655099780005, "rouge1_fmeasure_stderr": 0.002564935573693335, "rouge1_precision": 0.15890406603000953, "rouge1_precision_stderr": 0.002254955655153118, "rouge1_recall": 0.3419386542895376, "rouge1_recall_stderr": 0.004441817132952302, "rouge2_fmeasure": 0.04827458539234547, "rouge2_fmeasure_stderr": 0.0016311125149856224, "rouge2_precision": 0.035867673737784864, "rouge2_precision_stderr": 0.0012355715113504212, "rouge2_recall": 0.08210293496350446, "rouge2_recall_stderr": 0.0028582946303061215, "rougeL_fmeasure": 0.15600377002951135, "rougeL_fmeasure_stderr": 0.001956835201349077, "rougeL_precision": 0.1184088695231563, "rougeL_precision_stderr": 0.0016748896131188985, "rougeL_recall": 0.2575206419404795, "rougeL_recall_stderr": 0.0035534295864630325, "rougeLsum_fmeasure": 0.16431319751526183, "rougeLsum_fmeasure_stderr": 0.0021424109063426598, "rougeLsum_precision": 0.12439059930825258, "rougeLsum_precision_stderr": 0.0017863554797583551, "rougeLsum_recall": 0.27156070716777114, "rougeLsum_recall_stderr": 0.003862307786824889}}, "1": {"article_DOC_summary": {"bleu": 1.3965109587674343, "bleu_stderr": 0.06401667731693655, "rouge1_fmeasure": 0.17408107023086306, "rouge1_fmeasure_stderr": 0.0025007559781842124, "rouge1_precision": 0.12357832592160078, "rouge1_precision_stderr": 0.0018527507858758536, "rouge1_recall": 0.30684042727956246, "rouge1_recall_stderr": 0.004305759204889508, "rouge2_fmeasure": 0.03467713914457821, "rouge2_fmeasure_stderr": 0.001400238782790392, "rouge2_precision": 0.024345707251415688, 
"rouge2_precision_stderr": 0.000984809248208348, "rouge2_recall": 0.06298143710264606, "rouge2_recall_stderr": 0.002631710066142032, "rougeL_fmeasure": 0.1365127835924807, "rougeL_fmeasure_stderr": 0.0018703812542303563, "rougeL_precision": 0.09673384674343337, "rougeL_precision_stderr": 0.0013741741706308965, "rougeL_recall": 0.24213903819601934, "rougeL_recall_stderr": 0.0033704247478958086, "rougeLsum_fmeasure": 0.13958989051576162, "rougeLsum_fmeasure_stderr": 0.002058226884434151, "rougeLsum_precision": 0.09883792769116156, "rougeLsum_precision_stderr": 0.0015010324363623785, "rougeLsum_recall": 0.24767622402670803, "rougeLsum_recall_stderr": 0.0036851269749209087}}, "2": {"article_DOC_summary": {"bleu": 1.3350781966689242, "bleu_stderr": 0.05462322449688662, "rouge1_fmeasure": 0.17378684047409287, "rouge1_fmeasure_stderr": 0.00251306890551453, "rouge1_precision": 0.1232443202762208, "rouge1_precision_stderr": 0.0018597860894763215, "rouge1_recall": 0.3066997407165842, "rouge1_recall_stderr": 0.004281850801696637, "rouge2_fmeasure": 0.03500072349690542, "rouge2_fmeasure_stderr": 0.0014064236687193516, "rouge2_precision": 0.024538738271362173, "rouge2_precision_stderr": 0.000984589784225774, "rouge2_recall": 0.06382645423995705, "rouge2_recall_stderr": 0.002658487797915825, "rougeL_fmeasure": 0.13665315634539602, "rougeL_fmeasure_stderr": 0.0018588804586632999, "rougeL_precision": 0.09674319475304347, "rougeL_precision_stderr": 0.0013673643974684935, "rougeL_recall": 0.24265549182073776, "rougeL_recall_stderr": 0.003318826657004254, "rougeLsum_fmeasure": 0.1379932465252965, "rougeLsum_fmeasure_stderr": 0.0020476151006960097, "rougeLsum_precision": 0.09762025828076111, "rougeLsum_precision_stderr": 0.0014912532446376685, "rougeLsum_recall": 0.24533240423674169, "rougeLsum_recall_stderr": 0.003667805740862544}}, "3": {"article_DOC_summary": {"bleu": 1.3877489685443374, "bleu_stderr": 0.08723150605256752, "rouge1_fmeasure": 0.16668392777759847, "rouge1_fmeasure_stderr": 0.0025609126095560947, "rouge1_precision": 0.12074809695003069, "rouge1_precision_stderr": 0.0019691856555922723, "rouge1_recall": 0.28945836357231924, "rouge1_recall_stderr": 0.0044825833610854715, "rouge2_fmeasure": 0.032789849256510996, "rouge2_fmeasure_stderr": 0.0013657128342140336, "rouge2_precision": 0.023345501376425448, "rouge2_precision_stderr": 0.000988985937966658, "rouge2_recall": 0.05944080227614754, "rouge2_recall_stderr": 0.00256169341535352, "rougeL_fmeasure": 0.13209664627192183, "rougeL_fmeasure_stderr": 0.0019549880329955776, "rougeL_precision": 0.09555616349341924, "rougeL_precision_stderr": 0.0014946150544747267, "rougeL_recall": 0.2306803615801865, "rougeL_recall_stderr": 0.003552753971276667, "rougeLsum_fmeasure": 0.13393526771830042, "rougeLsum_fmeasure_stderr": 0.002116269943439794, "rougeLsum_precision": 0.0967664845472343, "rougeLsum_precision_stderr": 0.0015955111349148581, "rougeLsum_recall": 0.23437751662334644, "rougeLsum_recall_stderr": 0.0038695430932882123}}, "4": {"article_DOC_summary": {"bleu": 0.498735683070385, "bleu_stderr": 0.09460282574162138, "rouge1_fmeasure": 0.045612195294077, "rouge1_fmeasure_stderr": 0.002577411858944101, "rouge1_precision": 0.038291226660169694, "rouge1_precision_stderr": 0.0023827489261002907, "rouge1_recall": 0.07178712133575316, "rouge1_recall_stderr": 0.004098367009523302, "rouge2_fmeasure": 0.007733619159412637, "rouge2_fmeasure_stderr": 0.0007744182087008182, "rouge2_precision": 0.00670712896007596, "rouge2_precision_stderr": 0.0010524518148372528, 
"rouge2_recall": 0.012716363765006978, "rouge2_recall_stderr": 0.0012902939337883345, "rougeL_fmeasure": 0.03561894185219706, "rougeL_fmeasure_stderr": 0.002010352043305738, "rougeL_precision": 0.030604549821754326, "rougeL_precision_stderr": 0.0020230191034346237, "rougeL_recall": 0.056227302038037344, "rougeL_recall_stderr": 0.0032465166908807004, "rougeLsum_fmeasure": 0.03655668709145729, "rougeLsum_fmeasure_stderr": 0.0020865614848517965, "rougeLsum_precision": 0.03137303661342675, "rougeLsum_precision_stderr": 0.002074367350921982, "rougeLsum_recall": 0.05749454987519647, "rougeLsum_recall_stderr": 0.0033349680461757353}}, "5": {"article_DOC_summary": {"bleu": 5.598556081053473e-38, "bleu_stderr": 1.359631471631395e-33, "rouge1_fmeasure": 0.0022121024332002947, "rouge1_fmeasure_stderr": 0.0006067600008391147, "rouge1_precision": 0.00249226785896568, "rouge1_precision_stderr": 0.0006879879996776797, "rouge1_recall": 0.002055326448318596, "rouge1_recall_stderr": 0.0005613935086340863, "rouge2_fmeasure": 0.0001356086261746639, "rouge2_fmeasure_stderr": 0.00010000173916970802, "rouge2_precision": 0.00014889460644177625, "rouge2_precision_stderr": 0.00010929355651952094, "rouge2_recall": 0.0001245609736175774, "rouge2_recall_stderr": 9.221895207466138e-05, "rougeL_fmeasure": 0.001582264730874981, "rougeL_fmeasure_stderr": 0.00042536983909529074, "rougeL_precision": 0.001786603570440243, "rougeL_precision_stderr": 0.00048572134979040996, "rougeL_recall": 0.001477802641032664, "rougeL_recall_stderr": 0.000397466804454887, "rougeLsum_fmeasure": 0.0016327137269399594, "rougeLsum_fmeasure_stderr": 0.0004456806927818982, "rougeLsum_precision": 0.0018402056287592826, "rougeLsum_precision_stderr": 0.0005058536223241939, "rougeLsum_recall": 0.0015254489150940322, "rougeLsum_recall_stderr": 0.0004168475091777595}}}} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_0.csv b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..f707cd5e15007194d18d6aba5e6df4984e1abb18 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.332,0.014899597242811487,0 +anli_r2,acc,0.337,0.014955087918653605,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.2764505119453925,0.013069662474252425,0 +arc_challenge,acc_norm,0.2960750853242321,0.013340916085246258,0 +arc_easy,acc,0.5963804713804713,0.01006736896034822,0 +arc_easy,acc_norm,0.5382996632996633,0.010229639820610512,0 +boolq,acc,0.6296636085626911,0.008445882436783665,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.3312277706643904,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.4765982871937861,0.004984313205791442,0 +hellaswag,acc_norm,0.6216889065923122,0.004839746491523515,0 +piqa,acc,0.750816104461371,0.010091882770120216,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267312,0 +rte,acc,0.5740072202166066,0.02976495674177765,0 +sciq,acc,0.853,0.011203415395160336,0 +sciq,acc_norm,0.762,0.013473586661967222,0 +storycloze_2016,acc,0.7194013896312133,0.010389809647288816,0 +winogrande,acc,0.5714285714285714,0.013908353814606696,0 diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json deleted file mode 100644 index 
924cfee094bf97ba30f72c57e1f9792b6ad2397e..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811487 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.014955087918653605 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.3312277706643904 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - }, - "hellaswag": { - "acc": 0.4765982871937861, - "acc_stderr": 0.004984313205791442, - "acc_norm": 0.6216889065923122, - "acc_norm_stderr": 0.004839746491523515 - }, - "rte": { - "acc": 0.5740072202166066, - "acc_stderr": 0.02976495674177765 - }, - "winogrande": { - "acc": 0.5714285714285714, - "acc_stderr": 0.013908353814606696 - }, - "storycloze_2016": { - "acc": 0.7194013896312133, - "acc_stderr": 0.010389809647288816 - }, - "boolq": { - "acc": 0.6296636085626911, - "acc_stderr": 0.008445882436783665 - }, - "arc_easy": { - "acc": 0.5963804713804713, - "acc_stderr": 0.01006736896034822, - "acc_norm": 0.5382996632996633, - "acc_norm_stderr": 0.010229639820610512 - }, - "arc_challenge": { - "acc": 0.2764505119453925, - "acc_stderr": 0.013069662474252425, - "acc_norm": 0.2960750853242321, - "acc_norm_stderr": 0.013340916085246258 - }, - "sciq": { - "acc": 0.853, - "acc_stderr": 0.011203415395160336, - "acc_norm": 0.762, - "acc_norm_stderr": 0.013473586661967222 - }, - "piqa": { - "acc": 0.750816104461371, - "acc_stderr": 0.010091882770120216, - "acc_norm": 0.7589771490750816, - "acc_norm_stderr": 0.009979042717267312 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_1.csv b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..61e7f6444143c8ad2e2172e4a236dbc2b3106632 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.317,0.014721675438880224,0 +anli_r3,acc,0.345,0.01372842153945488,0 +arc_challenge,acc,0.2901023890784983,0.013261573677520764,0 +arc_challenge,acc_norm,0.318259385665529,0.013611993916971453,0 +arc_easy,acc,0.6123737373737373,0.00999730791444761,0 +arc_easy,acc_norm,0.5715488215488216,0.010154195733990972,0 +boolq,acc,0.6146788990825688,0.008511930879680642,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.3566561844863732,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4736108344951205,0.00498282691668715,0 +hellaswag,acc_norm,0.6191993626767576,0.004845912857338672,0 +piqa,acc,0.7551686615886833,0.010032309105568798,0 +piqa,acc_norm,0.7529923830250272,0.010062268140772636,0 +rte,acc,0.5740072202166066,0.029764956741777645,0 +sciq,acc,0.898,0.00957536880165389,0 +sciq,acc_norm,0.875,0.010463483381956722,0 +storycloze_2016,acc,0.7124532335649385,0.010466744473098357,0 +winogrande,acc,0.5706393054459353,0.013911537499969165,0 diff --git 
a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-01_1shots_backup.json b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-01_1shots_backup.json deleted file mode 100644 index cc655b5b1ebb8bd505487028a564c86ce4d58694..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-01_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.317, - "acc_stderr": 0.014721675438880224 - }, - "anli_r3": { - "acc": 0.345, - "acc_stderr": 0.01372842153945488 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.3566561844863732 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4736108344951205, - "acc_stderr": 0.00498282691668715, - "acc_norm": 0.6191993626767576, - "acc_norm_stderr": 0.004845912857338672 - }, - "rte": { - "acc": 0.5740072202166066, - "acc_stderr": 0.029764956741777645 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.013911537499969165 - }, - "storycloze_2016": { - "acc": 0.7124532335649385, - "acc_stderr": 0.010466744473098357 - }, - "boolq": { - "acc": 0.6146788990825688, - "acc_stderr": 0.008511930879680642 - }, - "arc_easy": { - "acc": 0.6123737373737373, - "acc_stderr": 0.00999730791444761, - "acc_norm": 0.5715488215488216, - "acc_norm_stderr": 0.010154195733990972 - }, - "arc_challenge": { - "acc": 0.2901023890784983, - "acc_stderr": 0.013261573677520764, - "acc_norm": 0.318259385665529, - "acc_norm_stderr": 0.013611993916971453 - }, - "sciq": { - "acc": 0.898, - "acc_stderr": 0.00957536880165389, - "acc_norm": 0.875, - "acc_norm_stderr": 0.010463483381956722 - }, - "piqa": { - "acc": 0.7551686615886833, - "acc_stderr": 0.010032309105568798, - "acc_norm": 0.7529923830250272, - "acc_norm_stderr": 0.010062268140772636 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_2.csv b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..773fe67163e16c60bcead5f70732f3709539380d --- /dev/null +++ b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363937,0 +anli_r2,acc,0.34,0.014987482264363935,0 +anli_r3,acc,0.33416666666666667,0.013622434813136772,0 +arc_challenge,acc,0.2832764505119454,0.013167478735134576,0 +arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0 +arc_easy,acc,0.6241582491582491,0.009938436373170635,0 +arc_easy,acc_norm,0.5951178451178452,0.0100724239603957,0 +boolq,acc,0.618348623853211,0.008496550741178258,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2654970760233918,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.4720175263891655,0.004981961097590805,0 +hellaswag,acc_norm,0.6224855606452898,0.004837744647345714,0 +piqa,acc,0.7595212187159956,0.009971345364651076,0 +piqa,acc_norm,0.7671381936887922,0.009861236071080753,0 +rte,acc,0.592057761732852,0.029581952519606186,0 
+sciq,acc,0.902,0.009406619184621235,0 +sciq,acc_norm,0.885,0.01009340759490463,0 +storycloze_2016,acc,0.7161945483698557,0.01042569627973092,0 +winogrande,acc,0.5603788476716653,0.01394964977601569,0 diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-02_2shots_backup.json b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-02_2shots_backup.json deleted file mode 100644 index 95669a2298841e450a6bf263fe4a11837cb27823..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-02_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363937 - }, - "anli_r2": { - "acc": 0.34, - "acc_stderr": 0.014987482264363935 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.013622434813136772 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2654970760233918 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.4720175263891655, - "acc_stderr": 0.004981961097590805, - "acc_norm": 0.6224855606452898, - "acc_norm_stderr": 0.004837744647345714 - }, - "rte": { - "acc": 0.592057761732852, - "acc_stderr": 0.029581952519606186 - }, - "winogrande": { - "acc": 0.5603788476716653, - "acc_stderr": 0.01394964977601569 - }, - "storycloze_2016": { - "acc": 0.7161945483698557, - "acc_stderr": 0.01042569627973092 - }, - "boolq": { - "acc": 0.618348623853211, - "acc_stderr": 0.008496550741178258 - }, - "arc_easy": { - "acc": 0.6241582491582491, - "acc_stderr": 0.009938436373170635, - "acc_norm": 0.5951178451178452, - "acc_norm_stderr": 0.0100724239603957 - }, - "arc_challenge": { - "acc": 0.2832764505119454, - "acc_stderr": 0.013167478735134576, - "acc_norm": 0.30887372013651876, - "acc_norm_stderr": 0.013501770929344003 - }, - "sciq": { - "acc": 0.902, - "acc_stderr": 0.009406619184621235, - "acc_norm": 0.885, - "acc_norm_stderr": 0.01009340759490463 - }, - "piqa": { - "acc": 0.7595212187159956, - "acc_stderr": 0.009971345364651076, - "acc_norm": 0.7671381936887922, - "acc_norm_stderr": 0.009861236071080753 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_3.csv b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..72b13aa4a716db2bb50b34351c8bb3b6525301b4 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.014806864733738863,0 +anli_r2,acc,0.373,0.01530049362292281,0 +anli_r3,acc,0.3475,0.013751753243291852,0 +arc_challenge,acc,0.2815699658703072,0.013143376735009026,0 +arc_challenge,acc_norm,0.3225255972696246,0.01365998089427737,0 +arc_easy,acc,0.6216329966329966,0.009951575683331949,0 +arc_easy,acc_norm,0.6018518518518519,0.010044662374653396,0 +boolq,acc,0.617125382262997,0.008501734385335953,1 +cb,acc,0.5357142857142857,0.06724777654937658,1 +cb,f1,0.466241360978203,,1 +copa,acc,0.8,0.04020151261036845,0 +hellaswag,acc,0.47231627165903206,0.004982127315605219,0 
+hellaswag,acc_norm,0.6231826329416451,0.004835981632401606,0 +piqa,acc,0.7540805223068553,0.010047331865625193,0 +piqa,acc_norm,0.7627856365614799,0.009924694933586371,0 +rte,acc,0.6028880866425993,0.029452371378346828,0 +sciq,acc,0.905,0.009276910103103286,0 +sciq,acc_norm,0.891,0.009859828407037186,0 +storycloze_2016,acc,0.7231427044361304,0.01034711289027692,0 +winogrande,acc,0.5840568271507498,0.013852485356798252,0 diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-02_3shots_backup.json b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-02_3shots_backup.json deleted file mode 100644 index 9c08ebed69c591eda298ddf73bb6f8347007e433..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-02_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.014806864733738863 - }, - "anli_r2": { - "acc": 0.373, - "acc_stderr": 0.01530049362292281 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291852 - }, - "cb": { - "acc": 0.5357142857142857, - "acc_stderr": 0.06724777654937658, - "f1": 0.466241360978203 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036845 - }, - "hellaswag": { - "acc": 0.47231627165903206, - "acc_stderr": 0.004982127315605219, - "acc_norm": 0.6231826329416451, - "acc_norm_stderr": 0.004835981632401606 - }, - "rte": { - "acc": 0.6028880866425993, - "acc_stderr": 0.029452371378346828 - }, - "winogrande": { - "acc": 0.5840568271507498, - "acc_stderr": 0.013852485356798252 - }, - "storycloze_2016": { - "acc": 0.7231427044361304, - "acc_stderr": 0.01034711289027692 - }, - "boolq": { - "acc": 0.617125382262997, - "acc_stderr": 0.008501734385335953 - }, - "arc_easy": { - "acc": 0.6216329966329966, - "acc_stderr": 0.009951575683331949, - "acc_norm": 0.6018518518518519, - "acc_norm_stderr": 0.010044662374653396 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.013143376735009026, - "acc_norm": 0.3225255972696246, - "acc_norm_stderr": 0.01365998089427737 - }, - "sciq": { - "acc": 0.905, - "acc_stderr": 0.009276910103103286, - "acc_norm": 0.891, - "acc_norm_stderr": 0.009859828407037186 - }, - "piqa": { - "acc": 0.7540805223068553, - "acc_stderr": 0.010047331865625193, - "acc_norm": 0.7627856365614799, - "acc_norm_stderr": 0.009924694933586371 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_4.csv b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..1cc168f0c4201acd92d742892b6682de70cf4023 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.014876872027456736,0 +anli_r2,acc,0.364,0.015222868840522024,0 +anli_r3,acc,0.3616666666666667,0.013876131663123877,0 +arc_challenge,acc,0.2815699658703072,0.013143376735009022,0 +arc_challenge,acc_norm,0.3242320819112628,0.013678810399518822,0 +arc_easy,acc,0.625,0.009933992677987828,0 +arc_easy,acc_norm,0.6132154882154882,0.009993308355370966,0 
+boolq,acc,0.6247706422018349,0.008468397820914277,1 +cb,acc,0.5535714285714286,0.06703189227942395,1 +cb,f1,0.38235294117647056,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.47321250746863175,0.004982615233057104,0 +hellaswag,acc_norm,0.6276638119896435,0.0048243930768266064,0 +piqa,acc,0.7573449401523396,0.010002002569708698,0 +piqa,acc_norm,0.7622415669205659,0.009932525779525492,0 +rte,acc,0.5956678700361011,0.029540420517619723,0 +sciq,acc,0.914,0.008870325962594766,0 +sciq,acc_norm,0.902,0.009406619184621219,0 +storycloze_2016,acc,0.7140566541956174,0.010449259851345843,0 +winogrande,acc,0.5674822415153907,0.013923911578623814,0 diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json deleted file mode 100644 index de3de8e74c728163b0692afcff6d08f647db9b1c..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.014876872027456736 - }, - "anli_r2": { - "acc": 0.364, - "acc_stderr": 0.015222868840522024 - }, - "anli_r3": { - "acc": 0.3616666666666667, - "acc_stderr": 0.013876131663123877 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942395, - "f1": 0.38235294117647056 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.47321250746863175, - "acc_stderr": 0.004982615233057104, - "acc_norm": 0.6276638119896435, - "acc_norm_stderr": 0.0048243930768266064 - }, - "rte": { - "acc": 0.5956678700361011, - "acc_stderr": 0.029540420517619723 - }, - "winogrande": { - "acc": 0.5674822415153907, - "acc_stderr": 0.013923911578623814 - }, - "storycloze_2016": { - "acc": 0.7140566541956174, - "acc_stderr": 0.010449259851345843 - }, - "boolq": { - "acc": 0.6247706422018349, - "acc_stderr": 0.008468397820914277 - }, - "arc_easy": { - "acc": 0.625, - "acc_stderr": 0.009933992677987828, - "acc_norm": 0.6132154882154882, - "acc_norm_stderr": 0.009993308355370966 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.013143376735009022, - "acc_norm": 0.3242320819112628, - "acc_norm_stderr": 0.013678810399518822 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.902, - "acc_norm_stderr": 0.009406619184621219 - }, - "piqa": { - "acc": 0.7573449401523396, - "acc_stderr": 0.010002002569708698, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525492 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_5.csv b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..d3292513b9f855f469ead4625b629b39f99576f8 --- /dev/null +++ b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.01505026612756444,0 +anli_r2,acc,0.338,0.01496596071022448,0 +anli_r3,acc,0.3516666666666667,0.013789711695404789,0 
+arc_challenge,acc,0.2909556313993174,0.01327307786590759,0 +arc_challenge,acc_norm,0.318259385665529,0.013611993916971453,0 +arc_easy,acc,0.6308922558922558,0.009901987410242738,0 +arc_easy,acc_norm,0.617003367003367,0.00997492038453648,0 +boolq,acc,0.6269113149847095,0.008458661252058382,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.3115193264446996,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4707229635530771,0.0049812201358823294,0 +hellaswag,acc_norm,0.6292571200955985,0.004820166002253066,0 +piqa,acc,0.7562568008705114,0.010017199471500617,0 +piqa,acc_norm,0.766050054406964,0.00987723689513744,0 +rte,acc,0.5884476534296029,0.029621832222417196,0 +sciq,acc,0.911,0.009008893392651526,0 +sciq,acc_norm,0.905,0.0092769101031033,0 +storycloze_2016,acc,0.7242116515232496,0.010334748387645675,0 +winogrande,acc,0.5722178374112076,0.013905134013839953,0 diff --git a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-02_5shots_backup.json b/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-02_5shots_backup.json deleted file mode 100644 index 584a7c5454347a629aed59668ac0f4d654bf2372..0000000000000000000000000000000000000000 --- a/4b284b17bc4seed4/evaluation/rankeval/4b284b17bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-02_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.01505026612756444 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.01496596071022448 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404789 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.3115193264446996 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4707229635530771, - "acc_stderr": 0.0049812201358823294, - "acc_norm": 0.6292571200955985, - "acc_norm_stderr": 0.004820166002253066 - }, - "rte": { - "acc": 0.5884476534296029, - "acc_stderr": 0.029621832222417196 - }, - "winogrande": { - "acc": 0.5722178374112076, - "acc_stderr": 0.013905134013839953 - }, - "storycloze_2016": { - "acc": 0.7242116515232496, - "acc_stderr": 0.010334748387645675 - }, - "boolq": { - "acc": 0.6269113149847095, - "acc_stderr": 0.008458661252058382 - }, - "arc_easy": { - "acc": 0.6308922558922558, - "acc_stderr": 0.009901987410242738, - "acc_norm": 0.617003367003367, - "acc_norm_stderr": 0.00997492038453648 - }, - "arc_challenge": { - "acc": 0.2909556313993174, - "acc_stderr": 0.01327307786590759, - "acc_norm": 0.318259385665529, - "acc_norm_stderr": 0.013611993916971453 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651526, - "acc_norm": 0.905, - "acc_norm_stderr": 0.0092769101031033 - }, - "piqa": { - "acc": 0.7562568008705114, - "acc_stderr": 0.010017199471500617, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.00987723689513744 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/generation/merged.csv b/4b284b21bc4seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d5b86be0f3b07e212ece1d1a0fbd507c1e5f23f5 --- /dev/null +++ b/4b284b21bc4seed1/evaluation/generation/merged.csv @@ -0,0 
+1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00012949433318118156 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00012949433318118156 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1821558520162302 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1821558520162302 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2160790254378622 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2160790254378622 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.2270812503617402 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.2270812503617402 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.22970482247017235 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.22970482247017235 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2303452786552104 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2303452786552104 +e2e_nlg_cleaned,5,average,multiple,0.1809159538790661 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04522961690526853 +gem_xsum,0,median,rouge2_fmeasure,0.04522961690526853 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.0373103830110333 +gem_xsum,1,median,rouge2_fmeasure,0.0373103830110333 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03670452685866085 +gem_xsum,2,median,rouge2_fmeasure,0.03670452685866085 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.035237949376792976 +gem_xsum,3,median,rouge2_fmeasure,0.035237949376792976 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01004804307836075 +gem_xsum,4,median,rouge2_fmeasure,0.01004804307836075 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0007279332281340716 +gem_xsum,5,median,rouge2_fmeasure,0.0007279332281340716 +gem_xsum,5,average,multiple,0.027543075409708413 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.047572357019245066 +web_nlg_en,0,median,rouge2_fmeasure,0.047572357019245066 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0513340131604287 +web_nlg_en,1,median,rouge2_fmeasure,0.0513340131604287 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.054159534529018366 +web_nlg_en,2,median,rouge2_fmeasure,0.054159534529018366 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05434522629151618 +web_nlg_en,3,median,rouge2_fmeasure,0.05434522629151618 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05546852434740693 +web_nlg_en,4,median,rouge2_fmeasure,0.05546852434740693 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05638496281385114 +web_nlg_en,5,median,rouge2_fmeasure,0.05638496281385114 +web_nlg_en,5,average,multiple,0.05321076969357773 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03716728942279169 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03716728942279169 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.0485299590695294 +wiki_lingua_en,1,median,rouge2_fmeasure,0.0485299590695294 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05328365330541802 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05328365330541802 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04324074407846396 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04324074407846396 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01286346881894465 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01286346881894465 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020213111085609944 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0020213111085609944 +wiki_lingua_en,5,average,multiple,0.03285107096728478 diff --git a/4b284b21bc4seed1/evaluation/generation/merged.json b/4b284b21bc4seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..abf5a77434b65390ef8095d308a300db3000a173 --- /dev/null +++ 
b/4b284b21bc4seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33272479915628694, "bleu_stderr": 0.031796098410938015, "rouge1_fmeasure": 0.09992028750901691, "rouge1_fmeasure_stderr": 0.002068624719484398, "rouge1_precision": 0.06768598529946503, "rouge1_precision_stderr": 0.001826760753754374, "rouge1_recall": 0.273245918626827, "rouge1_recall_stderr": 0.004811539102181524, "rouge2_fmeasure": 0.047572357019245066, "rouge2_fmeasure_stderr": 0.0012754558397189623, "rouge2_precision": 0.0319815601358353, "rouge2_precision_stderr": 0.001123592878248015, "rouge2_recall": 0.132780352400067, "rouge2_recall_stderr": 0.0031651283586402535, "rougeL_fmeasure": 0.09656759025734364, "rougeL_fmeasure_stderr": 0.0019482031642887698, "rougeL_precision": 0.06526863289287767, "rougeL_precision_stderr": 0.00173604911565671, "rougeL_recall": 0.2659429125520888, "rougeL_recall_stderr": 0.004693559342531821, "rougeLsum_fmeasure": 0.09550510540717662, "rougeLsum_fmeasure_stderr": 0.0019437957791240722, "rougeLsum_precision": 0.06475652685202478, "rougeLsum_precision_stderr": 0.001750562854208181, "rougeLsum_recall": 0.261555669425398, "rougeLsum_recall_stderr": 0.004531537846674438}}, "1": {"PALM_prompt": {"bleu": 0.5102447245370249, "bleu_stderr": 0.035376593830697835, "rouge1_fmeasure": 0.11038438449277037, "rouge1_fmeasure_stderr": 0.0019607246874588217, "rouge1_precision": 0.07172631396783712, "rouge1_precision_stderr": 0.0015940647947642393, "rouge1_recall": 0.3498874090683673, "rouge1_recall_stderr": 0.005494538712125141, "rouge2_fmeasure": 0.0513340131604287, "rouge2_fmeasure_stderr": 0.0012161196627700137, "rouge2_precision": 0.03374360598633707, "rouge2_precision_stderr": 0.0011472248278029387, "rouge2_recall": 0.17065910212771632, "rouge2_recall_stderr": 0.0037615577096519314, "rougeL_fmeasure": 0.10363295036370204, "rougeL_fmeasure_stderr": 0.0017764857744676573, "rougeL_precision": 0.06735170136224546, "rougeL_precision_stderr": 0.0014692804169579693, "rougeL_recall": 0.32775015583004236, "rougeL_recall_stderr": 0.005027007036378035, "rougeLsum_fmeasure": 0.10476642693447911, "rougeLsum_fmeasure_stderr": 0.001844638754248624, "rougeLsum_precision": 0.06821168744641012, "rougeLsum_precision_stderr": 0.0015257497030318006, "rougeLsum_recall": 0.3303827812174427, "rougeLsum_recall_stderr": 0.005034171208777432}}, "2": {"PALM_prompt": {"bleu": 0.6028097084858823, "bleu_stderr": 0.03400607345942724, "rouge1_fmeasure": 0.11747286047545925, "rouge1_fmeasure_stderr": 0.0018369017696682527, "rouge1_precision": 0.07447162662420612, "rouge1_precision_stderr": 0.001363119521612849, "rouge1_recall": 0.39133197093760397, "rouge1_recall_stderr": 0.005455286809780116, "rouge2_fmeasure": 0.054159534529018366, "rouge2_fmeasure_stderr": 0.00112831864459488, "rouge2_precision": 0.03418924065620199, "rouge2_precision_stderr": 0.0008008890429324677, "rouge2_recall": 0.19324361036894205, "rouge2_recall_stderr": 0.0038848529810624356, "rougeL_fmeasure": 0.10807779989120546, "rougeL_fmeasure_stderr": 0.001612487862874725, "rougeL_precision": 0.06854684035035577, "rougeL_precision_stderr": 0.0012086804724451806, "rougeL_recall": 0.35968605242390533, "rougeL_recall_stderr": 0.004868740565794037, "rougeLsum_fmeasure": 0.11132296857949775, "rougeLsum_fmeasure_stderr": 0.0017256589790344656, "rougeLsum_precision": 0.0706586243525616, "rougeLsum_precision_stderr": 0.0012889308612371538, "rougeLsum_recall": 0.36945068896305344, "rougeLsum_recall_stderr": 
0.0050409156545949945}}, "3": {"PALM_prompt": {"bleu": 0.701781042691669, "bleu_stderr": 0.028304269361164428, "rouge1_fmeasure": 0.1186720883935396, "rouge1_fmeasure_stderr": 0.0018557604964665015, "rouge1_precision": 0.07483297992459019, "rouge1_precision_stderr": 0.0013297778153553014, "rouge1_recall": 0.40075245309368346, "rouge1_recall_stderr": 0.005510805436391809, "rouge2_fmeasure": 0.05434522629151618, "rouge2_fmeasure_stderr": 0.0011369092672670533, "rouge2_precision": 0.03405829473639484, "rouge2_precision_stderr": 0.0007867378650100502, "rouge2_recall": 0.1965036741041513, "rouge2_recall_stderr": 0.0039013830928258273, "rougeL_fmeasure": 0.10798420755086514, "rougeL_fmeasure_stderr": 0.001585937192657532, "rougeL_precision": 0.06814102346988, "rougeL_precision_stderr": 0.001142595819228343, "rougeL_recall": 0.36375561641075393, "rougeL_recall_stderr": 0.004784629751764729, "rougeLsum_fmeasure": 0.11180266475563037, "rougeLsum_fmeasure_stderr": 0.0017161756564434687, "rougeLsum_precision": 0.0705542561432147, "rougeLsum_precision_stderr": 0.0012322784221801353, "rougeLsum_recall": 0.376368921472397, "rougeLsum_recall_stderr": 0.005042206855487164}}, "4": {"PALM_prompt": {"bleu": 0.7208477776174064, "bleu_stderr": 0.03611939053214278, "rouge1_fmeasure": 0.12065545317279747, "rouge1_fmeasure_stderr": 0.0017878857789550631, "rouge1_precision": 0.07573522475977945, "rouge1_precision_stderr": 0.0012828287982675156, "rouge1_recall": 0.4140707617730325, "rouge1_recall_stderr": 0.005500874400250013, "rouge2_fmeasure": 0.05546852434740693, "rouge2_fmeasure_stderr": 0.001093391566003447, "rouge2_precision": 0.03464259052791206, "rouge2_precision_stderr": 0.0007595549203652654, "rouge2_recall": 0.20555557401100102, "rouge2_recall_stderr": 0.003919174647287531, "rougeL_fmeasure": 0.10954384971885704, "rougeL_fmeasure_stderr": 0.001541389309718117, "rougeL_precision": 0.06882833861974198, "rougeL_precision_stderr": 0.00110954922342777, "rougeL_recall": 0.37506782401870103, "rougeL_recall_stderr": 0.004831464763832598, "rougeLsum_fmeasure": 0.11373501038504552, "rougeLsum_fmeasure_stderr": 0.0016710119042870925, "rougeLsum_precision": 0.07144735748031365, "rougeLsum_precision_stderr": 0.0011999248890901264, "rougeLsum_recall": 0.3887273820866732, "rougeLsum_recall_stderr": 0.005052183598911252}}, "5": {"PALM_prompt": {"bleu": 0.8098840843933562, "bleu_stderr": 0.042922709276963275, "rouge1_fmeasure": 0.12112590895694009, "rouge1_fmeasure_stderr": 0.0016877211094516577, "rouge1_precision": 0.07551077098651218, "rouge1_precision_stderr": 0.0011967463119907174, "rouge1_recall": 0.42778952645041607, "rouge1_recall_stderr": 0.005509208476502683, "rouge2_fmeasure": 0.05638496281385114, "rouge2_fmeasure_stderr": 0.0010526694616578302, "rouge2_precision": 0.03488380229207791, "rouge2_precision_stderr": 0.0007164646591878178, "rouge2_recall": 0.21683909270039745, "rouge2_recall_stderr": 0.0040700970178128, "rougeL_fmeasure": 0.1091505948569381, "rougeL_fmeasure_stderr": 0.0014719382275165554, "rougeL_precision": 0.06816852013471421, "rougeL_precision_stderr": 0.0010570419074259674, "rougeL_recall": 0.3851255216729155, "rougeL_recall_stderr": 0.004804722289936786, "rougeLsum_fmeasure": 0.1139947508286874, "rougeLsum_fmeasure_stderr": 0.0015813085492093937, "rougeLsum_precision": 0.07111730496416463, "rougeLsum_precision_stderr": 0.0011271521292987115, "rougeLsum_recall": 0.40225015185589585, "rougeLsum_recall_stderr": 0.005071744746719108}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 
1.604770833205388, "bleu_stderr": 0.045152527476120426, "rouge1_fmeasure": 0.17820031429089941, "rouge1_fmeasure_stderr": 0.001867331386262004, "rouge1_precision": 0.15167532178589505, "rouge1_precision_stderr": 0.0019073207185280718, "rouge1_recall": 0.2612333406333799, "rouge1_recall_stderr": 0.0026793907173360246, "rouge2_fmeasure": 0.03716728942279169, "rouge2_fmeasure_stderr": 0.0008623486309823396, "rouge2_precision": 0.03148570054286659, "rouge2_precision_stderr": 0.0007720646786309003, "rouge2_recall": 0.05626578766960683, "rouge2_recall_stderr": 0.0014278552081629383, "rougeL_fmeasure": 0.13833033412198834, "rougeL_fmeasure_stderr": 0.0013282918104537548, "rougeL_precision": 0.116140614745089, "rougeL_precision_stderr": 0.0013156638447678107, "rougeL_recall": 0.2083623246576857, "rougeL_recall_stderr": 0.0021996516173952707, "rougeLsum_fmeasure": 0.16463728733959723, "rougeLsum_fmeasure_stderr": 0.0017151648189316772, "rougeLsum_precision": 0.13994812048897234, "rougeLsum_precision_stderr": 0.0017518520260318953, "rougeLsum_recall": 0.2422441673257693, "rougeLsum_recall_stderr": 0.0024926109810662115}}, "1": {"tldr_en": {"bleu": 2.3579628914986084, "bleu_stderr": 0.06443336501226987, "rouge1_fmeasure": 0.20675601444211894, "rouge1_fmeasure_stderr": 0.0019509220985923807, "rouge1_precision": 0.17995455895644027, "rouge1_precision_stderr": 0.0021143646937022468, "rouge1_recall": 0.29822507703809786, "rouge1_recall_stderr": 0.0029202005379262794, "rouge2_fmeasure": 0.0485299590695294, "rouge2_fmeasure_stderr": 0.000987435381157907, "rouge2_precision": 0.04280457785161096, "rouge2_precision_stderr": 0.0010705605908791504, "rouge2_recall": 0.07256480707864589, "rouge2_recall_stderr": 0.0016883207002489, "rougeL_fmeasure": 0.14838617751760952, "rougeL_fmeasure_stderr": 0.0013193585886492735, "rougeL_precision": 0.128484042233791, "rougeL_precision_stderr": 0.0014903515496972157, "rougeL_recall": 0.21893094707653085, "rougeL_recall_stderr": 0.0022685657736807928, "rougeLsum_fmeasure": 0.19330419331545362, "rougeLsum_fmeasure_stderr": 0.001817315871702104, "rougeLsum_precision": 0.1682253075781992, "rougeLsum_precision_stderr": 0.001983258112076011, "rougeLsum_recall": 0.27945450640399006, "rougeLsum_recall_stderr": 0.002751044005784169}}, "2": {"tldr_en": {"bleu": 2.770701026929389, "bleu_stderr": 0.055755441860674826, "rouge1_fmeasure": 0.21447969004311607, "rouge1_fmeasure_stderr": 0.0018895718255246762, "rouge1_precision": 0.1942548807242778, "rouge1_precision_stderr": 0.0023454682458510967, "rouge1_recall": 0.3038516433424921, "rouge1_recall_stderr": 0.0028161999694894368, "rouge2_fmeasure": 0.05328365330541802, "rouge2_fmeasure_stderr": 0.0010288709039921085, "rouge2_precision": 0.0492948803130678, "rouge2_precision_stderr": 0.0011990391917610706, "rouge2_recall": 0.0782436349256461, "rouge2_recall_stderr": 0.0017608820777292064, "rougeL_fmeasure": 0.15374474737766516, "rougeL_fmeasure_stderr": 0.0013038739578348723, "rougeL_precision": 0.1390228041509367, "rougeL_precision_stderr": 0.0017234767147472074, "rougeL_recall": 0.22270032621786837, "rougeL_recall_stderr": 0.002252792096654515, "rougeLsum_fmeasure": 0.2018034121125251, "rougeLsum_fmeasure_stderr": 0.0017698133555769187, "rougeLsum_precision": 0.18265776299811193, "rougeLsum_precision_stderr": 0.0022101445170749252, "rougeLsum_recall": 0.28667373859931605, "rougeLsum_recall_stderr": 0.002687218604461664}}, "3": {"tldr_en": {"bleu": 2.6943492797886797, "bleu_stderr": 0.048945628141051856, "rouge1_fmeasure": 
0.17584661717144687, "rouge1_fmeasure_stderr": 0.0022080154978502656, "rouge1_precision": 0.16605472663049203, "rouge1_precision_stderr": 0.0026215736776626004, "rouge1_recall": 0.24674896622444079, "rouge1_recall_stderr": 0.0032942062409122403, "rouge2_fmeasure": 0.04324074407846396, "rouge2_fmeasure_stderr": 0.001030508940585308, "rouge2_precision": 0.041252526415028114, "rouge2_precision_stderr": 0.0012562811469129071, "rouge2_recall": 0.06385056201459363, "rouge2_recall_stderr": 0.0017227611606126488, "rougeL_fmeasure": 0.12719556123913905, "rougeL_fmeasure_stderr": 0.0015628923302645156, "rougeL_precision": 0.12107072019854664, "rougeL_precision_stderr": 0.0020206323069935596, "rougeL_recall": 0.18199308221887076, "rougeL_recall_stderr": 0.0025743010221870987, "rougeLsum_fmeasure": 0.166023927031211, "rougeLsum_fmeasure_stderr": 0.0020868492894240698, "rougeLsum_precision": 0.15696755228304862, "rougeLsum_precision_stderr": 0.0025037796725520395, "rougeLsum_recall": 0.23345046038796602, "rougeLsum_recall_stderr": 0.003144339747645053}}, "4": {"tldr_en": {"bleu": 0.5943697556411244, "bleu_stderr": 0.030734589500314064, "rouge1_fmeasure": 0.05454947380142532, "rouge1_fmeasure_stderr": 0.001847757669359123, "rouge1_precision": 0.05342346190272065, "rouge1_precision_stderr": 0.002056550316894281, "rouge1_recall": 0.07978774494235341, "rouge1_recall_stderr": 0.0028101467900972705, "rouge2_fmeasure": 0.01286346881894465, "rouge2_fmeasure_stderr": 0.0006471740657104415, "rouge2_precision": 0.012258350990130428, "rouge2_precision_stderr": 0.0007138900312061555, "rouge2_recall": 0.02028789716505487, "rouge2_recall_stderr": 0.0011428305900431244, "rougeL_fmeasure": 0.0407204750127799, "rougeL_fmeasure_stderr": 0.0013676744674585327, "rougeL_precision": 0.040366367324606266, "rougeL_precision_stderr": 0.0016113191055682687, "rougeL_recall": 0.06061271598925825, "rougeL_recall_stderr": 0.0021836846333784096, "rougeLsum_fmeasure": 0.05108474776360767, "rougeLsum_fmeasure_stderr": 0.0017328648234254131, "rougeLsum_precision": 0.050133371793595845, "rougeLsum_precision_stderr": 0.0019436034391785793, "rougeLsum_recall": 0.07473111376648439, "rougeLsum_recall_stderr": 0.0026329914241005055}}, "5": {"tldr_en": {"bleu": 1.5595918572619473e-06, "bleu_stderr": 2.8487471872210672e-06, "rouge1_fmeasure": 0.008385094317414335, "rouge1_fmeasure_stderr": 0.0007962608828602142, "rouge1_precision": 0.008364381665919872, "rouge1_precision_stderr": 0.0008819285241796705, "rouge1_recall": 0.012634998947429131, "rouge1_recall_stderr": 0.0012520411703047264, "rouge2_fmeasure": 0.0020213111085609944, "rouge2_fmeasure_stderr": 0.0002694978086459766, "rouge2_precision": 0.0019514480000772428, "rouge2_precision_stderr": 0.0002950485643713033, "rouge2_recall": 0.0034586914827538628, "rouge2_recall_stderr": 0.000553626619531344, "rougeL_fmeasure": 0.006545584095103677, "rougeL_fmeasure_stderr": 0.0006266707040781551, "rougeL_precision": 0.006554032979312583, "rougeL_precision_stderr": 0.0006990429427874044, "rougeL_recall": 0.010080116076976064, "rougeL_recall_stderr": 0.0010433208473045565, "rougeLsum_fmeasure": 0.007958529393460171, "rougeLsum_fmeasure_stderr": 0.0007541972102063722, "rougeLsum_precision": 0.007976144780102256, "rougeLsum_precision_stderr": 0.0008419903349386873, "rougeLsum_recall": 0.012040227491122851, "rougeLsum_recall_stderr": 0.001201302499319246}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.01599277764818925, "bleu_stderr": 0.004308927933825235, "rouge1_fmeasure": 
0.016178489257483206, "rouge1_fmeasure_stderr": 0.0003336557540954231, "rouge1_precision": 0.012870158730159176, "rouge1_precision_stderr": 0.00026604075009303314, "rouge1_recall": 0.02306192594226545, "rouge1_recall_stderr": 0.0005060585834775515, "rouge2_fmeasure": 0.00012949433318118156, "rouge2_fmeasure_stderr": 3.494334474550083e-05, "rouge2_precision": 0.00011382113821138212, "rouge2_precision_stderr": 3.0353975217624302e-05, "rouge2_recall": 0.00016007109878338666, "rouge2_recall_stderr": 4.471587603586543e-05, "rougeL_fmeasure": 0.016178489257483206, "rougeL_fmeasure_stderr": 0.0003336557540954231, "rougeL_precision": 0.012870158730159176, "rougeL_precision_stderr": 0.00026604075009303314, "rougeL_recall": 0.02306192594226545, "rougeL_recall_stderr": 0.0005060585834775515, "rougeLsum_fmeasure": 0.015580467425438196, "rougeLsum_fmeasure_stderr": 0.0003114685494407546, "rougeLsum_precision": 0.012401587301587743, "rougeLsum_precision_stderr": 0.0002495014907480676, "rougeLsum_recall": 0.022186928253083223, "rougeLsum_recall_stderr": 0.00046927560289039864}}, "1": {"generate_text_restaurant": {"bleu": 9.29730492833567, "bleu_stderr": 0.07258438562915104, "rouge1_fmeasure": 0.40179356473852385, "rouge1_fmeasure_stderr": 0.0024053635164423257, "rouge1_precision": 0.4508609495206719, "rouge1_precision_stderr": 0.0036311478842959532, "rouge1_recall": 0.4256279851535847, "rouge1_recall_stderr": 0.002998743529748849, "rouge2_fmeasure": 0.1821558520162302, "rouge2_fmeasure_stderr": 0.0018839477104031566, "rouge2_precision": 0.20803358909759323, "rouge2_precision_stderr": 0.002540052178876273, "rouge2_recall": 0.19234516196309973, "rouge2_recall_stderr": 0.0021041392938787457, "rougeL_fmeasure": 0.3001108360092531, "rougeL_fmeasure_stderr": 0.0019304143016397508, "rougeL_precision": 0.33647256551604726, "rougeL_precision_stderr": 0.0029522675564527113, "rougeL_recall": 0.32133682780546957, "rougeL_recall_stderr": 0.0025106371554624155, "rougeLsum_fmeasure": 0.3272696975463691, "rougeLsum_fmeasure_stderr": 0.002280428848910135, "rougeLsum_precision": 0.36851923372760476, "rougeLsum_precision_stderr": 0.0033065014951933927, "rougeLsum_recall": 0.3458837990441542, "rougeLsum_recall_stderr": 0.0027230707350231853}}, "2": {"generate_text_restaurant": {"bleu": 12.759775802861823, "bleu_stderr": 0.1293139727386451, "rouge1_fmeasure": 0.4496590978589395, "rouge1_fmeasure_stderr": 0.002303602978907222, "rouge1_precision": 0.5296073064762413, "rouge1_precision_stderr": 0.0033783994941725144, "rouge1_recall": 0.4363790698973518, "rouge1_recall_stderr": 0.0029267544023528365, "rouge2_fmeasure": 0.2160790254378622, "rouge2_fmeasure_stderr": 0.001955317829071995, "rouge2_precision": 0.25851059818334, "rouge2_precision_stderr": 0.0026408591114483963, "rouge2_recall": 0.20978543572844377, "rouge2_recall_stderr": 0.002146572930760848, "rougeL_fmeasure": 0.33140579758047195, "rougeL_fmeasure_stderr": 0.0020369186351820263, "rougeL_precision": 0.39232032536879197, "rougeL_precision_stderr": 0.0029964581011695346, "rougeL_recall": 0.3219131393183246, "rougeL_recall_stderr": 0.0024670087975040625, "rougeLsum_fmeasure": 0.3703216404379091, "rougeLsum_fmeasure_stderr": 0.0022909574817028392, "rougeLsum_precision": 0.4374911840880169, "rougeLsum_precision_stderr": 0.0032563934045941744, "rougeLsum_recall": 0.35889049112539584, "rougeLsum_recall_stderr": 0.0027163755824492135}}, "3": {"generate_text_restaurant": {"bleu": 13.39712229435215, "bleu_stderr": 0.2190368432916537, "rouge1_fmeasure": 0.4585328203108971, 
"rouge1_fmeasure_stderr": 0.0022516911107973987, "rouge1_precision": 0.5434706515030117, "rouge1_precision_stderr": 0.0032446983744738762, "rouge1_recall": 0.43747355273891847, "rouge1_recall_stderr": 0.0028993863830845397, "rouge2_fmeasure": 0.2270812503617402, "rouge2_fmeasure_stderr": 0.0019758386181450292, "rouge2_precision": 0.27356714226869305, "rouge2_precision_stderr": 0.0026390580957758175, "rouge2_recall": 0.21639197788244496, "rouge2_recall_stderr": 0.0021805025438367607, "rougeL_fmeasure": 0.34028277436237736, "rougeL_fmeasure_stderr": 0.0020545641994681512, "rougeL_precision": 0.40618251147113754, "rougeL_precision_stderr": 0.0029740816706799833, "rougeL_recall": 0.3236014788599963, "rougeL_recall_stderr": 0.0024273522496127556, "rougeLsum_fmeasure": 0.38125456908954924, "rougeLsum_fmeasure_stderr": 0.002257169022713513, "rougeLsum_precision": 0.4532770963097311, "rougeLsum_precision_stderr": 0.0031647100865919084, "rougeLsum_recall": 0.36298447462453876, "rougeLsum_recall_stderr": 0.0026887391183781422}}, "4": {"generate_text_restaurant": {"bleu": 13.278538903088437, "bleu_stderr": 0.19304270040523466, "rouge1_fmeasure": 0.4614844218085089, "rouge1_fmeasure_stderr": 0.0021663747761732207, "rouge1_precision": 0.5550942770770999, "rouge1_precision_stderr": 0.00321920680863295, "rouge1_recall": 0.43254174724244665, "rouge1_recall_stderr": 0.0027648654319805638, "rouge2_fmeasure": 0.22970482247017235, "rouge2_fmeasure_stderr": 0.00197438239706358, "rouge2_precision": 0.28123246363040955, "rouge2_precision_stderr": 0.0027007119359494314, "rouge2_recall": 0.2150097554160261, "rouge2_recall_stderr": 0.002138899675268787, "rougeL_fmeasure": 0.3427415979760685, "rougeL_fmeasure_stderr": 0.0020357821149855123, "rougeL_precision": 0.4143281149202266, "rougeL_precision_stderr": 0.002969403717017417, "rougeL_recall": 0.32090483861163693, "rougeL_recall_stderr": 0.0023977607834718777, "rougeLsum_fmeasure": 0.3847530991565415, "rougeLsum_fmeasure_stderr": 0.0022220842320301534, "rougeLsum_precision": 0.46330716929389093, "rougeLsum_precision_stderr": 0.0031399338067163917, "rougeLsum_recall": 0.3607474288264691, "rougeLsum_recall_stderr": 0.002652994557246981}}, "5": {"generate_text_restaurant": {"bleu": 13.349674798491353, "bleu_stderr": 0.2620211666324464, "rouge1_fmeasure": 0.46359996980874846, "rouge1_fmeasure_stderr": 0.0021412883557770652, "rouge1_precision": 0.5580977741060988, "rouge1_precision_stderr": 0.003152695337297477, "rouge1_recall": 0.4318929339624608, "rouge1_recall_stderr": 0.0027232610944967796, "rouge2_fmeasure": 0.2303452786552104, "rouge2_fmeasure_stderr": 0.0019438168093482682, "rouge2_precision": 0.282134536476671, "rouge2_precision_stderr": 0.0026354539351566403, "rouge2_recall": 0.21417820926964778, "rouge2_recall_stderr": 0.0020891541191163186, "rougeL_fmeasure": 0.34414481054807766, "rougeL_fmeasure_stderr": 0.0020332629423129916, "rougeL_precision": 0.416156553657584, "rougeL_precision_stderr": 0.002921110787140179, "rougeL_recall": 0.3201090740050369, "rougeL_recall_stderr": 0.0023555592018131553, "rougeLsum_fmeasure": 0.38760985028639305, "rougeLsum_fmeasure_stderr": 0.002239740316941447, "rougeLsum_precision": 0.46739254821741905, "rougeLsum_precision_stderr": 0.0031264669970885233, "rougeLsum_recall": 0.3608544692904568, "rougeLsum_recall_stderr": 0.0026213087830254975}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8758881711718112, "bleu_stderr": 0.06414775170791744, "rouge1_fmeasure": 0.20682985522235142, "rouge1_fmeasure_stderr": 
0.002563714031203911, "rouge1_precision": 0.15826471323077757, "rouge1_precision_stderr": 0.002220888825623518, "rouge1_recall": 0.33446079519909305, "rouge1_recall_stderr": 0.004514327805033082, "rouge2_fmeasure": 0.04522961690526853, "rouge2_fmeasure_stderr": 0.0015798221742638174, "rouge2_precision": 0.03352875592509948, "rouge2_precision_stderr": 0.0012249216700579001, "rouge2_recall": 0.07724702868679346, "rouge2_recall_stderr": 0.002831153557559276, "rougeL_fmeasure": 0.1540627639559492, "rougeL_fmeasure_stderr": 0.0019062590295789597, "rougeL_precision": 0.1175738942721655, "rougeL_precision_stderr": 0.0016420722284867819, "rougeL_recall": 0.2512371460248057, "rougeL_recall_stderr": 0.003517155084540941, "rougeLsum_fmeasure": 0.16037094135484917, "rougeLsum_fmeasure_stderr": 0.0021767207134326364, "rougeLsum_precision": 0.12200871250100262, "rougeLsum_precision_stderr": 0.0017899421156850875, "rougeLsum_recall": 0.26225586640138876, "rougeLsum_recall_stderr": 0.0040169777118884645}}, "1": {"article_DOC_summary": {"bleu": 1.4877593210481137, "bleu_stderr": 0.0786306768078736, "rouge1_fmeasure": 0.17974510059897553, "rouge1_fmeasure_stderr": 0.002465987111038623, "rouge1_precision": 0.12818817016431805, "rouge1_precision_stderr": 0.0018328880795144842, "rouge1_recall": 0.3134061916653512, "rouge1_recall_stderr": 0.004249929100773614, "rouge2_fmeasure": 0.0373103830110333, "rouge2_fmeasure_stderr": 0.0014354470060085698, "rouge2_precision": 0.026250493846268503, "rouge2_precision_stderr": 0.0010076577262875675, "rouge2_recall": 0.06742089722818025, "rouge2_recall_stderr": 0.002693226728725516, "rougeL_fmeasure": 0.14142245724497754, "rougeL_fmeasure_stderr": 0.001878295390484601, "rougeL_precision": 0.10061028549996975, "rougeL_precision_stderr": 0.0013774691842900675, "rougeL_recall": 0.24845939880798545, "rougeL_recall_stderr": 0.003416347548558214, "rougeLsum_fmeasure": 0.14274126658034353, "rougeLsum_fmeasure_stderr": 0.0020267265379362424, "rougeLsum_precision": 0.10150911522087737, "rougeLsum_precision_stderr": 0.0014800824882477777, "rougeLsum_recall": 0.2508723286793532, "rougeLsum_recall_stderr": 0.003661797386209274}}, "2": {"article_DOC_summary": {"bleu": 1.5028153336333563, "bleu_stderr": 0.07627640892766649, "rouge1_fmeasure": 0.17878904396689269, "rouge1_fmeasure_stderr": 0.00243376415334862, "rouge1_precision": 0.1272806924752953, "rouge1_precision_stderr": 0.0018087494542713912, "rouge1_recall": 0.3130608961268874, "rouge1_recall_stderr": 0.0041974862144461686, "rouge2_fmeasure": 0.03670452685866085, "rouge2_fmeasure_stderr": 0.0014016808833113095, "rouge2_precision": 0.02583921544659326, "rouge2_precision_stderr": 0.0009884044996505668, "rouge2_recall": 0.06617476951559573, "rouge2_recall_stderr": 0.0026088727674303245, "rougeL_fmeasure": 0.1413735388750044, "rougeL_fmeasure_stderr": 0.0018450620692794725, "rougeL_precision": 0.10041567883791579, "rougeL_precision_stderr": 0.0013584120767201249, "rougeL_recall": 0.24945606421737898, "rougeL_recall_stderr": 0.0033610500271998743, "rougeLsum_fmeasure": 0.1419412969578001, "rougeLsum_fmeasure_stderr": 0.0019890982108612416, "rougeLsum_precision": 0.10073064727294082, "rougeLsum_precision_stderr": 0.0014545958847864858, "rougeLsum_recall": 0.2508016791258512, "rougeLsum_recall_stderr": 0.003602192602128567}}, "3": {"article_DOC_summary": {"bleu": 1.4368429875503554, "bleu_stderr": 0.06985209662692778, "rouge1_fmeasure": 0.17306626516595003, "rouge1_fmeasure_stderr": 0.002606230170311302, "rouge1_precision": 
0.12623940374443, "rouge1_precision_stderr": 0.0020446872317856236, "rouge1_recall": 0.2970674436926099, "rouge1_recall_stderr": 0.004510700325635104, "rouge2_fmeasure": 0.035237949376792976, "rouge2_fmeasure_stderr": 0.0013859609116877547, "rouge2_precision": 0.025227030531171757, "rouge2_precision_stderr": 0.00099984133264785, "rouge2_recall": 0.062156816006725055, "rouge2_recall_stderr": 0.0025180978338057826, "rougeL_fmeasure": 0.1364033778870308, "rougeL_fmeasure_stderr": 0.0019668305545097802, "rougeL_precision": 0.09928754777119025, "rougeL_precision_stderr": 0.0015425645112311069, "rougeL_recall": 0.23556252311929232, "rougeL_recall_stderr": 0.003514098352558134, "rougeLsum_fmeasure": 0.13711597851384225, "rougeLsum_fmeasure_stderr": 0.002115039571938244, "rougeLsum_precision": 0.09980705574148897, "rougeLsum_precision_stderr": 0.0016450051443727947, "rougeLsum_recall": 0.23669283330652355, "rougeLsum_recall_stderr": 0.0037461804498405958}}, "4": {"article_DOC_summary": {"bleu": 0.6717545873809246, "bleu_stderr": 0.11330796657627773, "rouge1_fmeasure": 0.048983292472562175, "rouge1_fmeasure_stderr": 0.0027644367309486977, "rouge1_precision": 0.04214614218832052, "rouge1_precision_stderr": 0.0026107746281583345, "rouge1_recall": 0.07655987772910962, "rouge1_recall_stderr": 0.004452578402829074, "rouge2_fmeasure": 0.01004804307836075, "rouge2_fmeasure_stderr": 0.0009374479444550356, "rouge2_precision": 0.007766000425718809, "rouge2_precision_stderr": 0.0007712139973537835, "rouge2_recall": 0.016794467199208594, "rouge2_recall_stderr": 0.0015960487560251384, "rougeL_fmeasure": 0.03916437496912562, "rougeL_fmeasure_stderr": 0.0021783828707728175, "rougeL_precision": 0.03417705858833514, "rougeL_precision_stderr": 0.002144817166226441, "rougeL_recall": 0.0611728887335129, "rougeL_recall_stderr": 0.003535855187265437, "rougeLsum_fmeasure": 0.03924383294112677, "rougeLsum_fmeasure_stderr": 0.002217328487694054, "rougeLsum_precision": 0.034323144988876814, "rougeLsum_precision_stderr": 0.0021857348636160957, "rougeLsum_recall": 0.06145859851518466, "rougeLsum_recall_stderr": 0.003628835438679055}}, "5": {"article_DOC_summary": {"bleu": 2.1058893408297867e-36, "bleu_stderr": 2.1724367476442414e-31, "rouge1_fmeasure": 0.0032528851804285172, "rouge1_fmeasure_stderr": 0.0008749082661343398, "rouge1_precision": 0.0037186815835539475, "rouge1_precision_stderr": 0.0010258999278355571, "rouge1_recall": 0.002971904217615499, "rouge1_recall_stderr": 0.0007902202653735375, "rouge2_fmeasure": 0.0007279332281340716, "rouge2_fmeasure_stderr": 0.0002925088475825617, "rouge2_precision": 0.0008463719964829843, "rouge2_precision_stderr": 0.00034301989930096845, "rouge2_recall": 0.000656010656010656, "rouge2_recall_stderr": 0.0002674722049726138, "rougeL_fmeasure": 0.0025265679608804242, "rougeL_fmeasure_stderr": 0.0006801945500560441, "rougeL_precision": 0.002842770899097203, "rougeL_precision_stderr": 0.0007736763310592724, "rougeL_recall": 0.0023409347627226925, "rougeL_recall_stderr": 0.000630308860618993, "rougeLsum_fmeasure": 0.002627465953010381, "rougeLsum_fmeasure_stderr": 0.0007127898651991914, "rougeLsum_precision": 0.002946821953481221, "rougeLsum_precision_stderr": 0.0008038704154971681, "rougeLsum_recall": 0.00243903003284904, "rougeLsum_recall_stderr": 0.0006639365437911874}}}} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_0.csv b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_0.csv new file mode 100644 index 
0000000000000000000000000000000000000000..59990cd71e38de1fed1f5c2015afb82a79b17182 --- /dev/null +++ b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.332,0.014899597242811482,0 +anli_r2,acc,0.324,0.014806864733738857,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.27474402730375425,0.013044617212771227,0 +arc_challenge,acc_norm,0.295221843003413,0.013329750293382316,0 +arc_easy,acc,0.6018518518518519,0.010044662374653396,0 +arc_easy,acc_norm,0.5286195286195287,0.010242962617927197,0 +boolq,acc,0.6220183486238532,0.008480656964585246,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.36493558776167473,,1 +copa,acc,0.77,0.042295258468165065,0 +hellaswag,acc,0.4695279824736108,0.004980506329407586,0 +hellaswag,acc_norm,0.6156144194383589,0.0048545552940175395,0 +piqa,acc,0.750272034820457,0.010099232969867483,0 +piqa,acc_norm,0.7600652883569097,0.009963625892809545,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.832,0.011828605831454267,0 +sciq,acc_norm,0.751,0.013681600278702296,0 +storycloze_2016,acc,0.7199358631747729,0.01038376499392048,0 +winogrande,acc,0.5832675611681136,0.013856250072796316,0 diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json deleted file mode 100644 index a3bf8e3018b0efbc3058006e428aca902ed2a2ea..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811482 - }, - "anli_r2": { - "acc": 0.324, - "acc_stderr": 0.014806864733738857 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077247 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.36493558776167473 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.042295258468165065 - }, - "hellaswag": { - "acc": 0.4695279824736108, - "acc_stderr": 0.004980506329407586, - "acc_norm": 0.6156144194383589, - "acc_norm_stderr": 0.0048545552940175395 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5832675611681136, - "acc_stderr": 0.013856250072796316 - }, - "storycloze_2016": { - "acc": 0.7199358631747729, - "acc_stderr": 0.01038376499392048 - }, - "boolq": { - "acc": 0.6220183486238532, - "acc_stderr": 0.008480656964585246 - }, - "arc_easy": { - "acc": 0.6018518518518519, - "acc_stderr": 0.010044662374653396, - "acc_norm": 0.5286195286195287, - "acc_norm_stderr": 0.010242962617927197 - }, - "arc_challenge": { - "acc": 0.27474402730375425, - "acc_stderr": 0.013044617212771227, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.013329750293382316 - }, - "sciq": { - "acc": 0.832, - "acc_stderr": 0.011828605831454267, - "acc_norm": 0.751, - "acc_norm_stderr": 0.013681600278702296 - }, - "piqa": { - "acc": 0.750272034820457, - "acc_stderr": 0.010099232969867483, - "acc_norm": 0.7600652883569097, - "acc_norm_stderr": 0.009963625892809545 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - 
"arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_1.csv b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..fd172eec27351a0f4799c45109a5bb1430853fb4 --- /dev/null +++ b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.321,0.014770821817934642,0 +anli_r2,acc,0.321,0.014770821817934647,0 +anli_r3,acc,0.3333333333333333,0.013613950010225603,0 +arc_challenge,acc,0.28668941979522183,0.013214986329274762,0 +arc_challenge,acc_norm,0.3037542662116041,0.013438909184778759,0 +arc_easy,acc,0.6043771043771043,0.010033741393430986,0 +arc_easy,acc_norm,0.5686026936026936,0.010162752847747501,0 +boolq,acc,0.5773700305810398,0.008639722698719023,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.35714285714285704,,1 +copa,acc,0.74,0.04408440022768079,0 +hellaswag,acc,0.4691296554471221,0.004980262025472478,0 +hellaswag,acc_norm,0.6161123282214698,0.004853371646239242,0 +piqa,acc,0.7519042437431991,0.010077118315574715,0 +piqa,acc_norm,0.7606093579978237,0.00995588425029168,0 +rte,acc,0.5667870036101083,0.02982676408213828,0 +sciq,acc,0.883,0.010169287802713329,0 +sciq,acc_norm,0.859,0.011010914595992445,0 +storycloze_2016,acc,0.721004810261892,0.010371620932652793,0 +winogrande,acc,0.5651144435674822,0.013932814110418025,0 diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json deleted file mode 100644 index b63df694ea2ddfdec3ea8fdec7e22637cc7bc4d5..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.321, - "acc_stderr": 0.014770821817934642 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.014770821817934647 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.013613950010225603 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.35714285714285704 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768079 - }, - "hellaswag": { - "acc": 0.4691296554471221, - "acc_stderr": 0.004980262025472478, - "acc_norm": 0.6161123282214698, - "acc_norm_stderr": 0.004853371646239242 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.02982676408213828 - }, - "winogrande": { - "acc": 0.5651144435674822, - "acc_stderr": 0.013932814110418025 - }, - "storycloze_2016": { - "acc": 0.721004810261892, - "acc_stderr": 0.010371620932652793 - }, - "boolq": { - "acc": 0.5773700305810398, - "acc_stderr": 0.008639722698719023 - }, - "arc_easy": { - "acc": 0.6043771043771043, - "acc_stderr": 0.010033741393430986, - "acc_norm": 0.5686026936026936, - "acc_norm_stderr": 0.010162752847747501 - }, - "arc_challenge": { - "acc": 0.28668941979522183, - "acc_stderr": 0.013214986329274762, - "acc_norm": 0.3037542662116041, - "acc_norm_stderr": 0.013438909184778759 - }, - "sciq": { - "acc": 0.883, - "acc_stderr": 0.010169287802713329, - "acc_norm": 0.859, - "acc_norm_stderr": 0.011010914595992445 - }, - "piqa": { - "acc": 0.7519042437431991, - "acc_stderr": 0.010077118315574715, - "acc_norm": 0.7606093579978237, - "acc_norm_stderr": 0.00995588425029168 - } - }, - 
"versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_2.csv b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..a9c92274d79404f90026eb61969a96b6d9d6315e --- /dev/null +++ b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095524,0 +anli_r2,acc,0.322,0.014782913600996678,0 +anli_r3,acc,0.335,0.013630871843821476,0 +arc_challenge,acc,0.2883959044368601,0.013238394422428164,0 +arc_challenge,acc_norm,0.302901023890785,0.013428241573185349,0 +arc_easy,acc,0.6203703703703703,0.009958037725468575,0 +arc_easy,acc_norm,0.5959595959595959,0.010069061649549549,0 +boolq,acc,0.5923547400611621,0.008594580270731615,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.3018475149622691,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4645488946425015,0.004977223485342027,0 +hellaswag,acc_norm,0.6139215295757817,0.004858539527872464,0 +piqa,acc,0.750272034820457,0.010099232969867486,0 +piqa,acc_norm,0.750816104461371,0.01009188277012021,0 +rte,acc,0.5595667870036101,0.029882123363118712,0 +sciq,acc,0.893,0.009779910359847167,0 +sciq,acc_norm,0.879,0.010318210380946088,0 +storycloze_2016,acc,0.721004810261892,0.010371620932652793,0 +winogrande,acc,0.579321231254933,0.013874526372008327,0 diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_2_lm-eval_global_step80108_2023-02-15-11-04-03_2shots_backup.json b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_2_lm-eval_global_step80108_2023-02-15-11-04-03_2shots_backup.json deleted file mode 100644 index 34b9b1a2a6ac3eb6c3454952b8b6cceefa32a1c5..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_2_lm-eval_global_step80108_2023-02-15-11-04-03_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095524 - }, - "anli_r2": { - "acc": 0.322, - "acc_stderr": 0.014782913600996678 - }, - "anli_r3": { - "acc": 0.335, - "acc_stderr": 0.013630871843821476 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.3018475149622691 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4645488946425015, - "acc_stderr": 0.004977223485342027, - "acc_norm": 0.6139215295757817, - "acc_norm_stderr": 0.004858539527872464 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118712 - }, - "winogrande": { - "acc": 0.579321231254933, - "acc_stderr": 0.013874526372008327 - }, - "storycloze_2016": { - "acc": 0.721004810261892, - "acc_stderr": 0.010371620932652793 - }, - "boolq": { - "acc": 0.5923547400611621, - "acc_stderr": 0.008594580270731615 - }, - "arc_easy": { - "acc": 0.6203703703703703, - "acc_stderr": 0.009958037725468575, - "acc_norm": 0.5959595959595959, - "acc_norm_stderr": 0.010069061649549549 - }, - "arc_challenge": { - "acc": 0.2883959044368601, - "acc_stderr": 0.013238394422428164, - "acc_norm": 0.302901023890785, - "acc_norm_stderr": 0.013428241573185349 - }, - "sciq": { - "acc": 0.893, - "acc_stderr": 0.009779910359847167, - "acc_norm": 0.879, - "acc_norm_stderr": 
0.010318210380946088 - }, - "piqa": { - "acc": 0.750272034820457, - "acc_stderr": 0.010099232969867486, - "acc_norm": 0.750816104461371, - "acc_norm_stderr": 0.01009188277012021 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_3.csv b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..94611aac83a8af1cd28175e91fb4f7c223bddec2 --- /dev/null +++ b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.33,0.01487687202745673,0 +anli_r3,acc,0.32916666666666666,0.01357080625843363,0 +arc_challenge,acc,0.28668941979522183,0.013214986329274776,0 +arc_challenge,acc_norm,0.302901023890785,0.013428241573185347,0 +arc_easy,acc,0.6136363636363636,0.00999129677815963,0 +arc_easy,acc_norm,0.6102693602693603,0.01000716939179705,0 +boolq,acc,0.6024464831804281,0.00855952325693682,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.3314669652855543,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.4684325831507668,0.0049798268294007665,0 +hellaswag,acc_norm,0.6182035451105358,0.004848341560492134,0 +piqa,acc,0.749727965179543,0.010106561880089782,0 +piqa,acc_norm,0.7557127312295974,0.01002476517228425,0 +rte,acc,0.5595667870036101,0.02988212336311871,0 +sciq,acc,0.903,0.009363689373248107,0 +sciq,acc_norm,0.886,0.010055103435823335,0 +storycloze_2016,acc,0.7172634954569749,0.01041380648612127,0 +winogrande,acc,0.5698500394632992,0.013914685094716698,0 diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json deleted file mode 100644 index 10d6818e7e42c4482e1474a8f0ec6c0f143fd7b8..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.33, - "acc_stderr": 0.01487687202745673 - }, - "anli_r3": { - "acc": 0.32916666666666666, - "acc_stderr": 0.01357080625843363 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.3314669652855543 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 0.4684325831507668, - "acc_stderr": 0.0049798268294007665, - "acc_norm": 0.6182035451105358, - "acc_norm_stderr": 0.004848341560492134 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.02988212336311871 - }, - "winogrande": { - "acc": 0.5698500394632992, - "acc_stderr": 0.013914685094716698 - }, - "storycloze_2016": { - "acc": 0.7172634954569749, - "acc_stderr": 0.01041380648612127 - }, - "boolq": { - "acc": 0.6024464831804281, - "acc_stderr": 0.00855952325693682 - }, - "arc_easy": { - "acc": 0.6136363636363636, - "acc_stderr": 0.00999129677815963, - "acc_norm": 0.6102693602693603, - "acc_norm_stderr": 0.01000716939179705 - }, - "arc_challenge": { - "acc": 0.28668941979522183, - "acc_stderr": 
0.013214986329274776, - "acc_norm": 0.302901023890785, - "acc_norm_stderr": 0.013428241573185347 - }, - "sciq": { - "acc": 0.903, - "acc_stderr": 0.009363689373248107, - "acc_norm": 0.886, - "acc_norm_stderr": 0.010055103435823335 - }, - "piqa": { - "acc": 0.749727965179543, - "acc_stderr": 0.010106561880089782, - "acc_norm": 0.7557127312295974, - "acc_norm_stderr": 0.01002476517228425 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_4.csv b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..127f0ad7c6da7e4a920ced69303c82532c6e12e9 --- /dev/null +++ b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.321,0.01477082181793464,0 +anli_r2,acc,0.342,0.01500870618212173,0 +anli_r3,acc,0.32083333333333336,0.013480882752851552,0 +arc_challenge,acc,0.28071672354948807,0.013131238126975588,0 +arc_challenge,acc_norm,0.3191126279863481,0.013621696119173297,0 +arc_easy,acc,0.6224747474747475,0.009947227833469432,0 +arc_easy,acc_norm,0.601010101010101,0.010048240683798745,0 +boolq,acc,0.6125382262996942,0.00852066653613694,1 +cb,acc,0.5178571428571429,0.06737697508644647,1 +cb,f1,0.35968427443837275,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.465345548695479,0.004977782217582457,0 +hellaswag,acc_norm,0.6182035451105358,0.004848341560492137,0 +piqa,acc,0.7464635473340587,0.010150090834551782,0 +piqa,acc_norm,0.7557127312295974,0.010024765172284256,0 +rte,acc,0.5595667870036101,0.029882123363118712,0 +sciq,acc,0.905,0.009276910103103305,0 +sciq,acc_norm,0.902,0.009406619184621235,0 +storycloze_2016,acc,0.7242116515232496,0.010334748387645672,0 +winogrande,acc,0.5753749013417522,0.013891893150264224,0 diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json deleted file mode 100644 index 5b948a0034e9368a4c25af8746e8aa66142814a0..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.321, - "acc_stderr": 0.01477082181793464 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.01500870618212173 - }, - "anli_r3": { - "acc": 0.32083333333333336, - "acc_stderr": 0.013480882752851552 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644647, - "f1": 0.35968427443837275 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.465345548695479, - "acc_stderr": 0.004977782217582457, - "acc_norm": 0.6182035451105358, - "acc_norm_stderr": 0.004848341560492137 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118712 - }, - "winogrande": { - "acc": 0.5753749013417522, - "acc_stderr": 0.013891893150264224 - }, - "storycloze_2016": { - "acc": 0.7242116515232496, - "acc_stderr": 0.010334748387645672 - }, - "boolq": { - "acc": 0.6125382262996942, - "acc_stderr": 0.00852066653613694 - }, - 
"arc_easy": { - "acc": 0.6224747474747475, - "acc_stderr": 0.009947227833469432, - "acc_norm": 0.601010101010101, - "acc_norm_stderr": 0.010048240683798745 - }, - "arc_challenge": { - "acc": 0.28071672354948807, - "acc_stderr": 0.013131238126975588, - "acc_norm": 0.3191126279863481, - "acc_norm_stderr": 0.013621696119173297 - }, - "sciq": { - "acc": 0.905, - "acc_stderr": 0.009276910103103305, - "acc_norm": 0.902, - "acc_norm_stderr": 0.009406619184621235 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551782, - "acc_norm": 0.7557127312295974, - "acc_norm_stderr": 0.010024765172284256 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_5.csv b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..430c6616e1863fea2d7433290de55f390daa35c1 --- /dev/null +++ b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.308,0.014606483127342763,0 +anli_r2,acc,0.329,0.014865395385928373,0 +anli_r3,acc,0.31916666666666665,0.013462309712005124,0 +arc_challenge,acc,0.28668941979522183,0.01321498632927477,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053057,0 +arc_easy,acc,0.6308922558922558,0.009901987410242747,0 +arc_easy,acc_norm,0.6136363636363636,0.009991296778159615,0 +boolq,acc,0.6168195718654435,0.008503021391450788,1 +cb,acc,0.5714285714285714,0.06672848092813059,1 +cb,f1,0.40095238095238095,,1 +copa,acc,0.81,0.03942772444036623,0 +hellaswag,acc,0.46634136626170086,0.004978462690966927,0 +hellaswag,acc_norm,0.6188010356502689,0.00484688692976345,0 +piqa,acc,0.7464635473340587,0.010150090834551788,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267314,0 +rte,acc,0.5703971119133574,0.02979666882912467,0 +sciq,acc,0.915,0.008823426366942331,0 +sciq,acc_norm,0.903,0.009363689373248123,0 +storycloze_2016,acc,0.7226082308925709,0.010353267472010767,0 +winogrande,acc,0.5674822415153907,0.013923911578623837,0 diff --git a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json b/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json deleted file mode 100644 index 684984475060b4af2035ddb6b020d7bb8e1affe1..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed1/evaluation/rankeval/4b284b21bc4seed1_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.308, - "acc_stderr": 0.014606483127342763 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928373 - }, - "anli_r3": { - "acc": 0.31916666666666665, - "acc_stderr": 0.013462309712005124 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813059, - "f1": 0.40095238095238095 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.46634136626170086, - "acc_stderr": 0.004978462690966927, - "acc_norm": 0.6188010356502689, - "acc_norm_stderr": 0.00484688692976345 - }, - "rte": { - "acc": 0.5703971119133574, - "acc_stderr": 0.02979666882912467 - }, - "winogrande": { - "acc": 
0.5674822415153907, - "acc_stderr": 0.013923911578623837 - }, - "storycloze_2016": { - "acc": 0.7226082308925709, - "acc_stderr": 0.010353267472010767 - }, - "boolq": { - "acc": 0.6168195718654435, - "acc_stderr": 0.008503021391450788 - }, - "arc_easy": { - "acc": 0.6308922558922558, - "acc_stderr": 0.009901987410242747, - "acc_norm": 0.6136363636363636, - "acc_norm_stderr": 0.009991296778159615 - }, - "arc_challenge": { - "acc": 0.28668941979522183, - "acc_stderr": 0.01321498632927477, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053057 - }, - "sciq": { - "acc": 0.915, - "acc_stderr": 0.008823426366942331, - "acc_norm": 0.903, - "acc_norm_stderr": 0.009363689373248123 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551788, - "acc_norm": 0.7589771490750816, - "acc_norm_stderr": 0.009979042717267314 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed2/evaluation/generation/merged.csv b/4b284b21bc4seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..4b20f9b2f76d24a075b2a5013179c5c39a03e36e --- /dev/null +++ b/4b284b21bc4seed2/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00032804483755933243 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00032804483755933243 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.14487745848120898 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.14487745848120898 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.16097050153867934 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.16097050153867934 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.17904291710237294 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.17904291710237294 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.18976514478592516 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.18976514478592516 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19513439898691012 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19513439898691012 +e2e_nlg_cleaned,5,average,multiple,0.14501974428877598 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05093449549500333 +gem_xsum,0,median,rouge2_fmeasure,0.05093449549500333 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03807299304872921 +gem_xsum,1,median,rouge2_fmeasure,0.03807299304872921 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03829786428664246 +gem_xsum,2,median,rouge2_fmeasure,0.03829786428664246 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03633421176029688 +gem_xsum,3,median,rouge2_fmeasure,0.03633421176029688 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009377022747940285 +gem_xsum,4,median,rouge2_fmeasure,0.009377022747940285 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005666193916622732 +gem_xsum,5,median,rouge2_fmeasure,0.0005666193916622732 +gem_xsum,5,average,multiple,0.02893053445504574 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04861844568830817 +web_nlg_en,0,median,rouge2_fmeasure,0.04861844568830817 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05672274465750321 +web_nlg_en,1,median,rouge2_fmeasure,0.05672274465750321 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05789739591362932 
+web_nlg_en,2,median,rouge2_fmeasure,0.05789739591362932 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05845298625034767 +web_nlg_en,3,median,rouge2_fmeasure,0.05845298625034767 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.06051792994698306 +web_nlg_en,4,median,rouge2_fmeasure,0.06051792994698306 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05997095205778236 +web_nlg_en,5,median,rouge2_fmeasure,0.05997095205778236 +web_nlg_en,5,average,multiple,0.057030075752425635 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.034298445515984884 +wiki_lingua_en,0,median,rouge2_fmeasure,0.034298445515984884 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04668447958482574 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04668447958482574 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05512243219484514 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05512243219484514 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.047560822644652924 +wiki_lingua_en,3,median,rouge2_fmeasure,0.047560822644652924 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015173414928652448 +wiki_lingua_en,4,median,rouge2_fmeasure,0.015173414928652448 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002268834117142696 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002268834117142696 +wiki_lingua_en,5,average,multiple,0.033518071497683974 diff --git a/4b284b21bc4seed2/evaluation/generation/merged.json b/4b284b21bc4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..72cf81de3bdbc1f69b549afbcf2dfba26f5b97a5 --- /dev/null +++ b/4b284b21bc4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.32506312223519424, "bleu_stderr": 0.025893772747977932, "rouge1_fmeasure": 0.10433065433582071, "rouge1_fmeasure_stderr": 0.002045800614155292, "rouge1_precision": 0.06859349032117355, "rouge1_precision_stderr": 0.0016038312506390985, "rouge1_recall": 0.2941813489555871, "rouge1_recall_stderr": 0.004674021263394906, "rouge2_fmeasure": 0.04861844568830817, "rouge2_fmeasure_stderr": 0.0012939214664659474, "rouge2_precision": 0.03212658698266832, "rouge2_precision_stderr": 0.0010530735655225297, "rouge2_recall": 0.1386778384920652, "rouge2_recall_stderr": 0.003191413613494901, "rougeL_fmeasure": 0.1002621421342238, "rougeL_fmeasure_stderr": 0.001895084957108719, "rougeL_precision": 0.0656345480369304, "rougeL_precision_stderr": 0.0014573529495999864, "rougeL_recall": 0.28522310200632683, "rougeL_recall_stderr": 0.004520389282639347, "rougeLsum_fmeasure": 0.0995539972933265, "rougeLsum_fmeasure_stderr": 0.0019160684835195488, "rougeLsum_precision": 0.0653980881972399, "rougeLsum_precision_stderr": 0.0015014419295994892, "rougeLsum_recall": 0.28098744657798685, "rougeLsum_recall_stderr": 0.004381748686678463}}, "1": {"PALM_prompt": {"bleu": 0.5118016227412238, "bleu_stderr": 0.042623635278415395, "rouge1_fmeasure": 0.12088955331968472, "rouge1_fmeasure_stderr": 0.0019846793174451657, "rouge1_precision": 0.07757602455405031, "rouge1_precision_stderr": 0.0014534753546664488, "rouge1_recall": 0.3774716404768995, "rouge1_recall_stderr": 0.005305929451821015, "rouge2_fmeasure": 0.05672274465750321, "rouge2_fmeasure_stderr": 0.001258226651168741, "rouge2_precision": 0.0362851421424562, "rouge2_precision_stderr": 0.0008956093546268954, "rouge2_recall": 0.18524426315371975, "rouge2_recall_stderr": 0.003693116007080752, "rougeL_fmeasure": 0.11361139238348571, "rougeL_fmeasure_stderr": 0.0017875116804568483, "rougeL_precision": 0.07274599749024604, "rougeL_precision_stderr": 0.0012926720587857284, 
"rougeL_recall": 0.355092924706001, "rougeL_recall_stderr": 0.004854299370978627, "rougeLsum_fmeasure": 0.11510174006220572, "rougeLsum_fmeasure_stderr": 0.001863391319217601, "rougeLsum_precision": 0.07390123759045333, "rougeLsum_precision_stderr": 0.0013654602434686925, "rougeLsum_recall": 0.35831256976638354, "rougeLsum_recall_stderr": 0.004899522927326924}}, "2": {"PALM_prompt": {"bleu": 0.5840093945354913, "bleu_stderr": 0.04188339745568021, "rouge1_fmeasure": 0.12465621096807115, "rouge1_fmeasure_stderr": 0.0018703655658613531, "rouge1_precision": 0.07945750342123815, "rouge1_precision_stderr": 0.0014479489207553783, "rouge1_recall": 0.40825594120059006, "rouge1_recall_stderr": 0.005278174669595266, "rouge2_fmeasure": 0.05789739591362932, "rouge2_fmeasure_stderr": 0.001199170279038378, "rouge2_precision": 0.036744757326768296, "rouge2_precision_stderr": 0.0008792578328470093, "rouge2_recall": 0.2014423793700631, "rouge2_recall_stderr": 0.003818247451375373, "rougeL_fmeasure": 0.1154624476975127, "rougeL_fmeasure_stderr": 0.0016597153867437839, "rougeL_precision": 0.07347030137035865, "rougeL_precision_stderr": 0.0012477621361375887, "rougeL_recall": 0.37768137597443896, "rougeL_recall_stderr": 0.004751545894119024, "rougeLsum_fmeasure": 0.11837084927351957, "rougeLsum_fmeasure_stderr": 0.0017500298514746227, "rougeLsum_precision": 0.07540873858516584, "rougeLsum_precision_stderr": 0.0013179555493755973, "rougeLsum_recall": 0.38689757544087583, "rougeLsum_recall_stderr": 0.004882699109184072}}, "3": {"PALM_prompt": {"bleu": 0.6709398206896122, "bleu_stderr": 0.047769423505985516, "rouge1_fmeasure": 0.12486308816974763, "rouge1_fmeasure_stderr": 0.0018141719077548872, "rouge1_precision": 0.07881100544030407, "rouge1_precision_stderr": 0.001304955554970252, "rouge1_recall": 0.4124057146266478, "rouge1_recall_stderr": 0.005246109209443818, "rouge2_fmeasure": 0.05845298625034767, "rouge2_fmeasure_stderr": 0.0011862590488282388, "rouge2_precision": 0.036635961889873093, "rouge2_precision_stderr": 0.0008213071324834968, "rouge2_recall": 0.208225542736028, "rouge2_recall_stderr": 0.0039369080476096795, "rougeL_fmeasure": 0.11521943163869033, "rougeL_fmeasure_stderr": 0.001610539558916774, "rougeL_precision": 0.07274153773680196, "rougeL_precision_stderr": 0.0011633821312709138, "rougeL_recall": 0.38089519585678455, "rougeL_recall_stderr": 0.004728063182941111, "rougeLsum_fmeasure": 0.1183484530055362, "rougeLsum_fmeasure_stderr": 0.0017083674826636145, "rougeLsum_precision": 0.07477045841836422, "rougeLsum_precision_stderr": 0.0012355331377793054, "rougeLsum_recall": 0.3904327127987759, "rougeLsum_recall_stderr": 0.004884028804456976}}, "4": {"PALM_prompt": {"bleu": 0.717226798564371, "bleu_stderr": 0.05235423902349165, "rouge1_fmeasure": 0.1295714035230327, "rouge1_fmeasure_stderr": 0.0018420327565204727, "rouge1_precision": 0.08203688523709225, "rouge1_precision_stderr": 0.0013485494203466766, "rouge1_recall": 0.42140862503962817, "rouge1_recall_stderr": 0.005164591516935246, "rouge2_fmeasure": 0.06051792994698306, "rouge2_fmeasure_stderr": 0.0012071931655537776, "rouge2_precision": 0.03804051677909412, "rouge2_precision_stderr": 0.000845643440624378, "rouge2_recall": 0.21149985749947484, "rouge2_recall_stderr": 0.003860620724040806, "rougeL_fmeasure": 0.11864284534704603, "rougeL_fmeasure_stderr": 0.0016324931494254408, "rougeL_precision": 0.07507594990590853, "rougeL_precision_stderr": 0.0011943518000838561, "rougeL_recall": 0.38649010128141226, "rougeL_recall_stderr": 
0.004628444756913378, "rougeLsum_fmeasure": 0.12258797563270211, "rougeLsum_fmeasure_stderr": 0.0017216462979010524, "rougeLsum_precision": 0.07768635173205395, "rougeLsum_precision_stderr": 0.0012677627883765405, "rougeLsum_recall": 0.3979491074259348, "rougeLsum_recall_stderr": 0.004743571661336718}}, "5": {"PALM_prompt": {"bleu": 0.7837024299010963, "bleu_stderr": 0.04687108913758884, "rouge1_fmeasure": 0.12867770755532376, "rouge1_fmeasure_stderr": 0.0017907995403965442, "rouge1_precision": 0.08098981255155412, "rouge1_precision_stderr": 0.0012986711907095154, "rouge1_recall": 0.4294396683796783, "rouge1_recall_stderr": 0.005256939492907656, "rouge2_fmeasure": 0.05997095205778236, "rouge2_fmeasure_stderr": 0.0011763036318809183, "rouge2_precision": 0.03749292107154646, "rouge2_precision_stderr": 0.0008203811898837323, "rouge2_recall": 0.21661276600273288, "rouge2_recall_stderr": 0.003990616331353259, "rougeL_fmeasure": 0.11712871574656816, "rougeL_fmeasure_stderr": 0.0015783462905322296, "rougeL_precision": 0.07372036929996381, "rougeL_precision_stderr": 0.0011482103405321168, "rougeL_recall": 0.3915126182515876, "rougeL_recall_stderr": 0.0046668052784097185, "rougeLsum_fmeasure": 0.1214298733155526, "rougeLsum_fmeasure_stderr": 0.0016873280471566882, "rougeLsum_precision": 0.07647590415348687, "rougeLsum_precision_stderr": 0.0012280522580962734, "rougeLsum_recall": 0.40419873137833984, "rougeLsum_recall_stderr": 0.004827520663821318}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4608584188826386, "bleu_stderr": 0.0530791011108354, "rouge1_fmeasure": 0.17322809310474896, "rouge1_fmeasure_stderr": 0.0017908446486168677, "rouge1_precision": 0.14654900702824386, "rouge1_precision_stderr": 0.0018063404204406743, "rouge1_recall": 0.25503720319711043, "rouge1_recall_stderr": 0.0026160039796892633, "rouge2_fmeasure": 0.034298445515984884, "rouge2_fmeasure_stderr": 0.0008203354371417247, "rouge2_precision": 0.028612548548136534, "rouge2_precision_stderr": 0.0007127877744078934, "rouge2_recall": 0.05279174190112992, "rouge2_recall_stderr": 0.001379618447010414, "rougeL_fmeasure": 0.13592864767523646, "rougeL_fmeasure_stderr": 0.0012935373206819489, "rougeL_precision": 0.11363888589301006, "rougeL_precision_stderr": 0.0012703980384269685, "rougeL_recall": 0.20501713486527137, "rougeL_recall_stderr": 0.002147517762826042, "rougeLsum_fmeasure": 0.1599177121150833, "rougeLsum_fmeasure_stderr": 0.0016445158655453984, "rougeLsum_precision": 0.13510478941478388, "rougeLsum_precision_stderr": 0.0016545178102930402, "rougeLsum_recall": 0.23604870436592665, "rougeLsum_recall_stderr": 0.002421328558194817}}, "1": {"tldr_en": {"bleu": 2.3966066814918157, "bleu_stderr": 0.05367929210307168, "rouge1_fmeasure": 0.1982032455796027, "rouge1_fmeasure_stderr": 0.0020176984061830703, "rouge1_precision": 0.1734392435728412, "rouge1_precision_stderr": 0.0022005941412746063, "rouge1_recall": 0.2835869137995969, "rouge1_recall_stderr": 0.002885685597988053, "rouge2_fmeasure": 0.04668447958482574, "rouge2_fmeasure_stderr": 0.0010022474154175907, "rouge2_precision": 0.04149997490553237, "rouge2_precision_stderr": 0.0010674584264283784, "rouge2_recall": 0.06868219059074646, "rouge2_recall_stderr": 0.0016243497803729017, "rougeL_fmeasure": 0.14863788313374796, "rougeL_fmeasure_stderr": 0.0013761560951888873, "rougeL_precision": 0.12919613127158547, "rougeL_precision_stderr": 0.0015642451075119819, "rougeL_recall": 0.21761320862142447, "rougeL_recall_stderr": 0.0022591710241408394, "rougeLsum_fmeasure": 
0.18428785278478904, "rougeLsum_fmeasure_stderr": 0.0018795396325519526, "rougeLsum_precision": 0.16103026799665304, "rougeLsum_precision_stderr": 0.002048994341301177, "rougeLsum_recall": 0.26462971235663546, "rougeLsum_recall_stderr": 0.002734346549211203}}, "2": {"tldr_en": {"bleu": 2.8945524687514457, "bleu_stderr": 0.07184698921333843, "rouge1_fmeasure": 0.22000975094048222, "rouge1_fmeasure_stderr": 0.0019196293722291687, "rouge1_precision": 0.19457689329314268, "rouge1_precision_stderr": 0.002260756056763997, "rouge1_recall": 0.314039100456002, "rouge1_recall_stderr": 0.0027595985462308755, "rouge2_fmeasure": 0.05512243219484514, "rouge2_fmeasure_stderr": 0.0010318753523195182, "rouge2_precision": 0.0494730237951709, "rouge2_precision_stderr": 0.0011272834928775885, "rouge2_recall": 0.0810667814947236, "rouge2_recall_stderr": 0.0016990089475787328, "rougeL_fmeasure": 0.15989123909343908, "rougeL_fmeasure_stderr": 0.0013276007999853572, "rougeL_precision": 0.14093298706922133, "rougeL_precision_stderr": 0.001654821217199275, "rougeL_recall": 0.23368231370987727, "rougeL_recall_stderr": 0.0022516888453590965, "rougeLsum_fmeasure": 0.20653141298637975, "rougeLsum_fmeasure_stderr": 0.0017934155889773602, "rougeLsum_precision": 0.1825978062155656, "rougeLsum_precision_stderr": 0.002130225159997019, "rougeLsum_recall": 0.29579669341488013, "rougeLsum_recall_stderr": 0.002637070044280065}}, "3": {"tldr_en": {"bleu": 3.0602276174662797, "bleu_stderr": 0.08140252545516478, "rouge1_fmeasure": 0.18719041840682565, "rouge1_fmeasure_stderr": 0.002263749209581055, "rouge1_precision": 0.17154995366414352, "rouge1_precision_stderr": 0.002562308241187173, "rouge1_recall": 0.2676757228726807, "rouge1_recall_stderr": 0.0033611913545249297, "rouge2_fmeasure": 0.047560822644652924, "rouge2_fmeasure_stderr": 0.0010445651116964955, "rouge2_precision": 0.04292321928703632, "rouge2_precision_stderr": 0.001064702716287979, "rouge2_recall": 0.07068433635722526, "rouge2_recall_stderr": 0.0017497586284916746, "rougeL_fmeasure": 0.1354379034088655, "rougeL_fmeasure_stderr": 0.0016042022099017302, "rougeL_precision": 0.12421934115552721, "rougeL_precision_stderr": 0.0019015982358437386, "rougeL_recall": 0.19821540363389906, "rougeL_recall_stderr": 0.0026681499144326185, "rougeLsum_fmeasure": 0.17652791528010714, "rougeLsum_fmeasure_stderr": 0.0021286102824532036, "rougeLsum_precision": 0.16183366894292667, "rougeLsum_precision_stderr": 0.0024239308899604867, "rougeLsum_recall": 0.2533167464662618, "rougeLsum_recall_stderr": 0.0032102990450269485}}, "4": {"tldr_en": {"bleu": 0.6934835415911031, "bleu_stderr": 0.049667470868950084, "rouge1_fmeasure": 0.06085530545718036, "rouge1_fmeasure_stderr": 0.00203307390969258, "rouge1_precision": 0.05713222279079333, "rouge1_precision_stderr": 0.0021266654785382483, "rouge1_recall": 0.09058330612542118, "rouge1_recall_stderr": 0.003079513808745527, "rouge2_fmeasure": 0.015173414928652448, "rouge2_fmeasure_stderr": 0.0007150751207413451, "rouge2_precision": 0.014207618224936538, "rouge2_precision_stderr": 0.0008214073022225183, "rouge2_recall": 0.02408636460856734, "rouge2_recall_stderr": 0.0012343919074854972, "rougeL_fmeasure": 0.0450507360001391, "rougeL_fmeasure_stderr": 0.0014926765541292855, "rougeL_precision": 0.042414718667727, "rougeL_precision_stderr": 0.0015987476163280138, "rougeL_recall": 0.06883510681650253, "rougeL_recall_stderr": 0.0024186528354051195, "rougeLsum_fmeasure": 0.05703443465650266, "rougeLsum_fmeasure_stderr": 0.0019072034434628312, 
"rougeLsum_precision": 0.05347341120866329, "rougeLsum_precision_stderr": 0.001986260479403247, "rougeLsum_recall": 0.08506757665131119, "rougeLsum_recall_stderr": 0.002909995691190911}}, "5": {"tldr_en": {"bleu": 1.581309668123882e-06, "bleu_stderr": 2.4592802738754113e-06, "rouge1_fmeasure": 0.009690917181266634, "rouge1_fmeasure_stderr": 0.0008967008433661314, "rouge1_precision": 0.009326365925988343, "rouge1_precision_stderr": 0.0009238891705239445, "rouge1_recall": 0.014943156822805441, "rouge1_recall_stderr": 0.0014270358715897308, "rouge2_fmeasure": 0.002268834117142696, "rouge2_fmeasure_stderr": 0.0002916739063088069, "rouge2_precision": 0.0019508123925038809, "rouge2_precision_stderr": 0.0002665571578964472, "rouge2_recall": 0.00398100278797549, "rouge2_recall_stderr": 0.0005950165807991629, "rougeL_fmeasure": 0.006943264579604218, "rougeL_fmeasure_stderr": 0.0006330411491022215, "rougeL_precision": 0.006685088969622301, "rougeL_precision_stderr": 0.0006601897650917386, "rougeL_recall": 0.01106204631934071, "rougeL_recall_stderr": 0.0010938254411800782, "rougeLsum_fmeasure": 0.00910140326625081, "rougeLsum_fmeasure_stderr": 0.0008378756467906282, "rougeLsum_precision": 0.008768623444343692, "rougeLsum_precision_stderr": 0.0008666782347407663, "rougeLsum_recall": 0.014121358378765587, "rougeLsum_recall_stderr": 0.001350760934994208}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.015589228807418672, "bleu_stderr": 0.003691217032514631, "rouge1_fmeasure": 0.03512570831981043, "rouge1_fmeasure_stderr": 0.0007028508250409884, "rouge1_precision": 0.08238834340384042, "rouge1_precision_stderr": 0.0016031811195891402, "rouge1_recall": 0.025430011369854285, "rouge1_recall_stderr": 0.0006498826885017055, "rouge2_fmeasure": 0.00032804483755933243, "rouge2_fmeasure_stderr": 8.20801366964415e-05, "rouge2_precision": 0.0009723255571910732, "rouge2_precision_stderr": 0.00024982188929618097, "rouge2_recall": 0.00029303535674764464, "rouge2_recall_stderr": 9.939261536575641e-05, "rougeL_fmeasure": 0.03480594809918208, "rougeL_fmeasure_stderr": 0.0006948353713699121, "rougeL_precision": 0.08144389895939601, "rougeL_precision_stderr": 0.0015698164934837453, "rougeL_recall": 0.025235905197915645, "rougeL_recall_stderr": 0.000647019518371156, "rougeLsum_fmeasure": 0.03432140843843363, "rougeLsum_fmeasure_stderr": 0.0006753530356000441, "rougeLsum_precision": 0.08173127808720583, "rougeLsum_precision_stderr": 0.0016008167026965942, "rougeLsum_recall": 0.024195685454103392, "rougeLsum_recall_stderr": 0.0005527347315330655}}, "1": {"generate_text_restaurant": {"bleu": 6.836179529973976, "bleu_stderr": 0.0808622381692175, "rouge1_fmeasure": 0.35350598832846664, "rouge1_fmeasure_stderr": 0.0020049438944572543, "rouge1_precision": 0.32884898005646757, "rouge1_precision_stderr": 0.002746055697580347, "rouge1_recall": 0.44575132680164475, "rouge1_recall_stderr": 0.002825021204102268, "rouge2_fmeasure": 0.14487745848120898, "rouge2_fmeasure_stderr": 0.0015129362043146879, "rouge2_precision": 0.13607489118649332, "rouge2_precision_stderr": 0.0018147007082345516, "rouge2_recall": 0.18458105214823597, "rouge2_recall_stderr": 0.0020132959428884193, "rougeL_fmeasure": 0.2625755338174827, "rougeL_fmeasure_stderr": 0.001535900050678149, "rougeL_precision": 0.24335207327056155, "rougeL_precision_stderr": 0.0021318513982543723, "rougeL_recall": 0.33532204803532295, "rougeL_recall_stderr": 0.002383375008474535, "rougeLsum_fmeasure": 0.2893417910289629, "rougeLsum_fmeasure_stderr": 
0.0019000123325331364, "rougeLsum_precision": 0.26974981288358574, "rougeLsum_precision_stderr": 0.002469140003825862, "rougeLsum_recall": 0.3644056974222114, "rougeLsum_recall_stderr": 0.0026205219344870726}}, "2": {"generate_text_restaurant": {"bleu": 7.593692630726807, "bleu_stderr": 0.12147840458175509, "rouge1_fmeasure": 0.3650716348139436, "rouge1_fmeasure_stderr": 0.0018388019470279935, "rouge1_precision": 0.32070413847189994, "rouge1_precision_stderr": 0.002359554141426242, "rouge1_recall": 0.47928591751649846, "rouge1_recall_stderr": 0.002722114189800939, "rouge2_fmeasure": 0.16097050153867934, "rouge2_fmeasure_stderr": 0.0014656932148909823, "rouge2_precision": 0.14103295618834455, "rouge2_precision_stderr": 0.0015746461899898117, "rouge2_recall": 0.21563029482851057, "rouge2_recall_stderr": 0.002136380088242833, "rougeL_fmeasure": 0.27596556132236433, "rougeL_fmeasure_stderr": 0.0014888754707697203, "rougeL_precision": 0.24046287771000435, "rougeL_precision_stderr": 0.001798012166542668, "rougeL_recall": 0.3679421301525328, "rougeL_recall_stderr": 0.0025079121076047625, "rougeLsum_fmeasure": 0.3002423567504504, "rougeLsum_fmeasure_stderr": 0.0018046105124439181, "rougeLsum_precision": 0.2637914012671533, "rougeLsum_precision_stderr": 0.0021472511284155046, "rougeLsum_recall": 0.3948037364571785, "rougeLsum_recall_stderr": 0.002648544902201612}}, "3": {"generate_text_restaurant": {"bleu": 8.742374199936782, "bleu_stderr": 0.09424505315134048, "rouge1_fmeasure": 0.3899947455542205, "rouge1_fmeasure_stderr": 0.0019080696284148931, "rouge1_precision": 0.35039399690574724, "rouge1_precision_stderr": 0.002305847291735412, "rouge1_recall": 0.48660889841428495, "rouge1_recall_stderr": 0.002662644687856354, "rouge2_fmeasure": 0.17904291710237294, "rouge2_fmeasure_stderr": 0.0015715270006421594, "rouge2_precision": 0.1599868739123523, "rouge2_precision_stderr": 0.0015782362564147653, "rouge2_recall": 0.227448454804275, "rouge2_recall_stderr": 0.0021747446525376547, "rougeL_fmeasure": 0.2920617305214459, "rougeL_fmeasure_stderr": 0.0015719969421114034, "rougeL_precision": 0.2602583118911549, "rougeL_precision_stderr": 0.001737371791496901, "rougeL_recall": 0.36945692088555687, "rougeL_recall_stderr": 0.0024499926133160622, "rougeLsum_fmeasure": 0.32444631733345547, "rougeLsum_fmeasure_stderr": 0.0018967884322166942, "rougeLsum_precision": 0.2913246360455281, "rougeLsum_precision_stderr": 0.002124454204809321, "rougeLsum_recall": 0.40523354266323236, "rougeLsum_recall_stderr": 0.0026084638294323506}}, "4": {"generate_text_restaurant": {"bleu": 9.715088335051762, "bleu_stderr": 0.1753473511599001, "rouge1_fmeasure": 0.40775875197875355, "rouge1_fmeasure_stderr": 0.0019664206284239825, "rouge1_precision": 0.37658509845981214, "rouge1_precision_stderr": 0.002384636357332195, "rouge1_recall": 0.486714622072443, "rouge1_recall_stderr": 0.0026221329215418985, "rouge2_fmeasure": 0.18976514478592516, "rouge2_fmeasure_stderr": 0.0016958589616437444, "rouge2_precision": 0.1747916801012778, "rouge2_precision_stderr": 0.0017350307207232446, "rouge2_recall": 0.22931314976608455, "rouge2_recall_stderr": 0.0022024157955143195, "rougeL_fmeasure": 0.3023558736561066, "rougeL_fmeasure_stderr": 0.0016456600817606013, "rougeL_precision": 0.2776183901125436, "rougeL_precision_stderr": 0.001855703858157366, "rougeL_recall": 0.36494128754990124, "rougeL_recall_stderr": 0.0023932425770631528, "rougeLsum_fmeasure": 0.3424660839752939, "rougeLsum_fmeasure_stderr": 0.001996421212700881, "rougeLsum_precision": 
0.3162075740349658, "rougeLsum_precision_stderr": 0.0022600138968863186, "rougeLsum_recall": 0.40902136651768844, "rougeLsum_recall_stderr": 0.0026079274805328985}}, "5": {"generate_text_restaurant": {"bleu": 10.31740400799245, "bleu_stderr": 0.17344138200062734, "rouge1_fmeasure": 0.4160635303296256, "rouge1_fmeasure_stderr": 0.0019167446694135697, "rouge1_precision": 0.3911617622914696, "rouge1_precision_stderr": 0.0023614083172843452, "rouge1_recall": 0.4840060164486427, "rouge1_recall_stderr": 0.0025607770603185355, "rouge2_fmeasure": 0.19513439898691012, "rouge2_fmeasure_stderr": 0.0016801719843503758, "rouge2_precision": 0.18311975241328707, "rouge2_precision_stderr": 0.0017530732674177275, "rouge2_recall": 0.22944970891513594, "rouge2_recall_stderr": 0.002134666804521969, "rougeL_fmeasure": 0.30846083197605884, "rougeL_fmeasure_stderr": 0.001637729985430372, "rougeL_precision": 0.28869111802776204, "rougeL_precision_stderr": 0.001879336189442547, "rougeL_recall": 0.36202682605226727, "rougeL_recall_stderr": 0.0023254592064409552, "rougeLsum_fmeasure": 0.3513163230656758, "rougeLsum_fmeasure_stderr": 0.001957980092746368, "rougeLsum_precision": 0.33010797860958013, "rougeLsum_precision_stderr": 0.0022472175989479344, "rougeLsum_recall": 0.4090265340755352, "rougeLsum_recall_stderr": 0.002546467237868763}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1407026541783893, "bleu_stderr": 0.09037567653449961, "rouge1_fmeasure": 0.2130534919539574, "rouge1_fmeasure_stderr": 0.0025029827276464548, "rouge1_precision": 0.15787173182833486, "rouge1_precision_stderr": 0.0020401967493817856, "rouge1_recall": 0.3575349440272045, "rouge1_recall_stderr": 0.0044992439078584915, "rouge2_fmeasure": 0.05093449549500333, "rouge2_fmeasure_stderr": 0.0015982440427958716, "rouge2_precision": 0.036995243747182334, "rouge2_precision_stderr": 0.001189777484488148, "rouge2_recall": 0.08887777631236293, "rouge2_recall_stderr": 0.0028946858955236896, "rougeL_fmeasure": 0.15848964083025063, "rougeL_fmeasure_stderr": 0.001895901022123941, "rougeL_precision": 0.11735551251948108, "rougeL_precision_stderr": 0.0015519522146072052, "rougeL_recall": 0.26754759258055894, "rougeL_recall_stderr": 0.0035643834192978716, "rougeLsum_fmeasure": 0.16921433699659624, "rougeLsum_fmeasure_stderr": 0.0021331443008039677, "rougeLsum_precision": 0.12511375969112523, "rougeLsum_precision_stderr": 0.0016999095838239333, "rougeLsum_recall": 0.285784101072217, "rougeLsum_recall_stderr": 0.003970274987518834}}, "1": {"article_DOC_summary": {"bleu": 1.5258803326642025, "bleu_stderr": 0.04802303753716418, "rouge1_fmeasure": 0.18060678306331202, "rouge1_fmeasure_stderr": 0.0025809573229702634, "rouge1_precision": 0.12866810306322857, "rouge1_precision_stderr": 0.0019102479517656475, "rouge1_recall": 0.3152190979928625, "rouge1_recall_stderr": 0.004462238018541599, "rouge2_fmeasure": 0.03807299304872921, "rouge2_fmeasure_stderr": 0.0014688491194977736, "rouge2_precision": 0.02676557417591505, "rouge2_precision_stderr": 0.0010345920327863589, "rouge2_recall": 0.06868703168878479, "rouge2_recall_stderr": 0.0027290584553812565, "rougeL_fmeasure": 0.14041635566457955, "rougeL_fmeasure_stderr": 0.0019382870753545767, "rougeL_precision": 0.09984151880119294, "rougeL_precision_stderr": 0.0014227004264263632, "rougeL_recall": 0.24649628118013558, "rougeL_recall_stderr": 0.003467176011262577, "rougeLsum_fmeasure": 0.1431680713585917, "rougeLsum_fmeasure_stderr": 0.0021366368130727436, "rougeLsum_precision": 0.10174290101033899, 
"rougeLsum_precision_stderr": 0.0015628179713401368, "rougeLsum_recall": 0.2515700448017643, "rougeLsum_recall_stderr": 0.003812752448423601}}, "2": {"article_DOC_summary": {"bleu": 1.4807110641239705, "bleu_stderr": 0.05642507718368326, "rouge1_fmeasure": 0.17919800155343935, "rouge1_fmeasure_stderr": 0.0024773560103552544, "rouge1_precision": 0.12762605157499543, "rouge1_precision_stderr": 0.0018347498284269797, "rouge1_recall": 0.3130722022576149, "rouge1_recall_stderr": 0.004288622872438158, "rouge2_fmeasure": 0.03829786428664246, "rouge2_fmeasure_stderr": 0.0014210624037590423, "rouge2_precision": 0.027038903613225806, "rouge2_precision_stderr": 0.0010036982915438686, "rouge2_recall": 0.06847210404764151, "rouge2_recall_stderr": 0.0026456020226303906, "rougeL_fmeasure": 0.14394583719234913, "rougeL_fmeasure_stderr": 0.0019028216895477855, "rougeL_precision": 0.10241448305776109, "rougeL_precision_stderr": 0.001406273732891567, "rougeL_recall": 0.2524259625919352, "rougeL_recall_stderr": 0.0033729592986806267, "rougeLsum_fmeasure": 0.14111487802968178, "rougeLsum_fmeasure_stderr": 0.0020103034855313232, "rougeLsum_precision": 0.1002779789628322, "rougeLsum_precision_stderr": 0.001472636844565553, "rougeLsum_recall": 0.24825857387427142, "rougeLsum_recall_stderr": 0.0036037687382947966}}, "3": {"article_DOC_summary": {"bleu": 1.5480716201391962, "bleu_stderr": 0.04259756224632486, "rouge1_fmeasure": 0.1708616882829907, "rouge1_fmeasure_stderr": 0.0026372448948957907, "rouge1_precision": 0.1245146875791404, "rouge1_precision_stderr": 0.0020452410734284427, "rouge1_recall": 0.29245279417648024, "rouge1_recall_stderr": 0.004518130032710362, "rouge2_fmeasure": 0.03633421176029688, "rouge2_fmeasure_stderr": 0.001423553912853919, "rouge2_precision": 0.025988881798573865, "rouge2_precision_stderr": 0.0010211664215581159, "rouge2_recall": 0.06407684856222447, "rouge2_recall_stderr": 0.002609965847216835, "rougeL_fmeasure": 0.13811589968950733, "rougeL_fmeasure_stderr": 0.002059269651225793, "rougeL_precision": 0.10062009439170647, "rougeL_precision_stderr": 0.0016054032423320594, "rougeL_recall": 0.23738866114842447, "rougeL_recall_stderr": 0.0036328476884833756, "rougeLsum_fmeasure": 0.13501624474663962, "rougeLsum_fmeasure_stderr": 0.0021698097402404657, "rougeLsum_precision": 0.09834084127829348, "rougeLsum_precision_stderr": 0.0016818769932268626, "rougeLsum_recall": 0.23253064424615483, "rougeLsum_recall_stderr": 0.0038178300293972766}}, "4": {"article_DOC_summary": {"bleu": 0.6276187214183829, "bleu_stderr": 0.10466473087339175, "rouge1_fmeasure": 0.0475846797438099, "rouge1_fmeasure_stderr": 0.00265141713171967, "rouge1_precision": 0.043316822403892084, "rouge1_precision_stderr": 0.0028900125408040456, "rouge1_recall": 0.07276841656671419, "rouge1_recall_stderr": 0.004133235807638063, "rouge2_fmeasure": 0.009377022747940285, "rouge2_fmeasure_stderr": 0.000854007658898027, "rouge2_precision": 0.0093576557082385, "rouge2_precision_stderr": 0.0014626698630045996, "rouge2_recall": 0.014492420646261656, "rouge2_recall_stderr": 0.0013238813272262598, "rougeL_fmeasure": 0.03832605984378281, "rougeL_fmeasure_stderr": 0.0021137783954494655, "rougeL_precision": 0.035639613494581245, "rougeL_precision_stderr": 0.0025150520302418987, "rougeL_recall": 0.05854363623579474, "rougeL_recall_stderr": 0.003308989848879334, "rougeLsum_fmeasure": 0.03855249240002344, "rougeLsum_fmeasure_stderr": 0.002170801274633859, "rougeLsum_precision": 0.035795061037585306, "rougeLsum_precision_stderr": 
0.002539847586499542, "rougeLsum_recall": 0.05893322820138485, "rougeLsum_recall_stderr": 0.0034149524939063266}}, "5": {"article_DOC_summary": {"bleu": 4.750313828560887e-40, "bleu_stderr": 1.219522064952425e-32, "rouge1_fmeasure": 0.002884301686287677, "rouge1_fmeasure_stderr": 0.0007752057983486471, "rouge1_precision": 0.003333596131072619, "rouge1_precision_stderr": 0.0009161215054684143, "rouge1_recall": 0.002627738914142911, "rouge1_recall_stderr": 0.0007104755333532985, "rouge2_fmeasure": 0.0005666193916622732, "rouge2_fmeasure_stderr": 0.0002695930799049985, "rouge2_precision": 0.0006633042010400501, "rouge2_precision_stderr": 0.0002992483341086919, "rouge2_recall": 0.0005217005217005217, "rouge2_recall_stderr": 0.0002680851938493634, "rougeL_fmeasure": 0.002387639095962103, "rougeL_fmeasure_stderr": 0.000654167470455253, "rougeL_precision": 0.0027764850648050597, "rougeL_precision_stderr": 0.0007798529533808385, "rougeL_recall": 0.0021689895202009484, "rougeL_recall_stderr": 0.0005997028831033762, "rougeLsum_fmeasure": 0.0023944782740570364, "rougeLsum_fmeasure_stderr": 0.0006504412933734339, "rougeLsum_precision": 0.002779638127059121, "rougeLsum_precision_stderr": 0.0007743437565217287, "rougeLsum_recall": 0.0021785988527847533, "rougeLsum_recall_stderr": 0.000597762769700717}}}} \ No newline at end of file diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_0.csv b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..cd9810803f6724f532b71c4adf852bdbf76d7663 --- /dev/null +++ b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795023,0 +anli_r2,acc,0.333,0.014910846164229864,0 +anli_r3,acc,0.3516666666666667,0.013789711695404785,0 +arc_challenge,acc,0.27559726962457337,0.01305716965576184,0 +arc_challenge,acc_norm,0.30631399317406144,0.013470584417276511,0 +arc_easy,acc,0.6018518518518519,0.010044662374653398,0 +arc_easy,acc_norm,0.5214646464646465,0.010250325159456652,0 +boolq,acc,0.6100917431192661,0.008530437972862622,1 +cb,acc,0.2857142857142857,0.06091449038731724,1 +cb,f1,0.24845800389121164,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.47918741286596295,0.004985456752161002,0 +hellaswag,acc_norm,0.6287592113124876,0.004821492994082102,0 +piqa,acc,0.750816104461371,0.010091882770120216,0 +piqa,acc_norm,0.7616974972796517,0.009940334245876219,0 +rte,acc,0.5306859205776173,0.03003973059219781,0 +sciq,acc,0.852,0.01123486636423524,0 +sciq,acc_norm,0.768,0.01335493745228157,0 +storycloze_2016,acc,0.7087119187600214,0.010506919924163614,0 +winogrande,acc,0.5816890292028414,0.013863669961195904,0 diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json deleted file mode 100644 index 6c94933631beba39e768e6c81937a5944aca403c..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795023 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229864 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404785 - }, - "cb": { - "acc": 
0.2857142857142857, - "acc_stderr": 0.06091449038731724, - "f1": 0.24845800389121164 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.47918741286596295, - "acc_stderr": 0.004985456752161002, - "acc_norm": 0.6287592113124876, - "acc_norm_stderr": 0.004821492994082102 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.03003973059219781 - }, - "winogrande": { - "acc": 0.5816890292028414, - "acc_stderr": 0.013863669961195904 - }, - "storycloze_2016": { - "acc": 0.7087119187600214, - "acc_stderr": 0.010506919924163614 - }, - "boolq": { - "acc": 0.6100917431192661, - "acc_stderr": 0.008530437972862622 - }, - "arc_easy": { - "acc": 0.6018518518518519, - "acc_stderr": 0.010044662374653398, - "acc_norm": 0.5214646464646465, - "acc_norm_stderr": 0.010250325159456652 - }, - "arc_challenge": { - "acc": 0.27559726962457337, - "acc_stderr": 0.01305716965576184, - "acc_norm": 0.30631399317406144, - "acc_norm_stderr": 0.013470584417276511 - }, - "sciq": { - "acc": 0.852, - "acc_stderr": 0.01123486636423524, - "acc_norm": 0.768, - "acc_norm_stderr": 0.01335493745228157 - }, - "piqa": { - "acc": 0.750816104461371, - "acc_stderr": 0.010091882770120216, - "acc_norm": 0.7616974972796517, - "acc_norm_stderr": 0.009940334245876219 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_1.csv b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..5e8720f85d8b2534f362636cc774fe1f467b91a0 --- /dev/null +++ b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.343,0.015019206922356951,0 +anli_r2,acc,0.325,0.014818724459095527,0 +anli_r3,acc,0.33666666666666667,0.013647602942406393,0 +arc_challenge,acc,0.30887372013651876,0.013501770929344003,0 +arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0 +arc_easy,acc,0.617003367003367,0.009974920384536462,0 +arc_easy,acc_norm,0.5761784511784511,0.01014000609521361,0 +boolq,acc,0.617125382262997,0.008501734385335953,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.39080213903743316,,1 +copa,acc,0.74,0.04408440022768079,0 +hellaswag,acc,0.47759410476000796,0.004984768912326932,0 +hellaswag,acc_norm,0.6294562836088429,0.004819633668832537,0 +piqa,acc,0.750272034820457,0.010099232969867492,0 +piqa,acc_norm,0.7584330794341676,0.009986718001804461,0 +rte,acc,0.5523465703971119,0.029931070362939533,0 +sciq,acc,0.906,0.009233052000787726,0 +sciq,acc_norm,0.885,0.010093407594904628,0 +storycloze_2016,acc,0.706574024585783,0.010529489334744471,0 +winogrande,acc,0.5730071033938438,0.013901878072575057,0 diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json deleted file mode 100644 index 7d8f14610e765accd5fe7ed1d6948fd7a6f89e5c..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, 
- "acc_stderr": 0.015019206922356951 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095527 - }, - "anli_r3": { - "acc": 0.33666666666666667, - "acc_stderr": 0.013647602942406393 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.39080213903743316 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768079 - }, - "hellaswag": { - "acc": 0.47759410476000796, - "acc_stderr": 0.004984768912326932, - "acc_norm": 0.6294562836088429, - "acc_norm_stderr": 0.004819633668832537 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939533 - }, - "winogrande": { - "acc": 0.5730071033938438, - "acc_stderr": 0.013901878072575057 - }, - "storycloze_2016": { - "acc": 0.706574024585783, - "acc_stderr": 0.010529489334744471 - }, - "boolq": { - "acc": 0.617125382262997, - "acc_stderr": 0.008501734385335953 - }, - "arc_easy": { - "acc": 0.617003367003367, - "acc_stderr": 0.009974920384536462, - "acc_norm": 0.5761784511784511, - "acc_norm_stderr": 0.01014000609521361 - }, - "arc_challenge": { - "acc": 0.30887372013651876, - "acc_stderr": 0.013501770929344003, - "acc_norm": 0.3165529010238908, - "acc_norm_stderr": 0.01359243151906808 - }, - "sciq": { - "acc": 0.906, - "acc_stderr": 0.009233052000787726, - "acc_norm": 0.885, - "acc_norm_stderr": 0.010093407594904628 - }, - "piqa": { - "acc": 0.750272034820457, - "acc_stderr": 0.010099232969867492, - "acc_norm": 0.7584330794341676, - "acc_norm_stderr": 0.009986718001804461 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_2.csv b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..503fea2939633152499a737e74b8f85d4397116a --- /dev/null +++ b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.347,0.015060472031706618,0 +anli_r3,acc,0.3275,0.013553211167251953,0 +arc_challenge,acc,0.30716723549488056,0.013481034054980945,0 +arc_challenge,acc_norm,0.33276450511945393,0.013769863046192314,0 +arc_easy,acc,0.6254208754208754,0.00993175882041061,0 +arc_easy,acc_norm,0.5993265993265994,0.010055304474255585,0 +boolq,acc,0.636697247706422,0.008411885836787163,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.26868521549372615,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.47470623381796456,0.004983392650570956,0 +hellaswag,acc_norm,0.6330412268472416,0.004809901151234834,0 +piqa,acc,0.749727965179543,0.010106561880089786,0 +piqa,acc_norm,0.76550598476605,0.00988520314324054,0 +rte,acc,0.51985559566787,0.030072723167317184,0 +sciq,acc,0.904,0.009320454434783222,0 +sciq,acc_norm,0.893,0.009779910359847165,0 +storycloze_2016,acc,0.7188669160876536,0.010395836091628113,0 +winogrande,acc,0.5816890292028414,0.013863669961195918,0 diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json deleted file mode 100644 index 2746e8034ab12b710df73a97bcfc7d5ce2bb1c3e..0000000000000000000000000000000000000000 
--- a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706618 - }, - "anli_r3": { - "acc": 0.3275, - "acc_stderr": 0.013553211167251953 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.26868521549372615 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.47470623381796456, - "acc_stderr": 0.004983392650570956, - "acc_norm": 0.6330412268472416, - "acc_norm_stderr": 0.004809901151234834 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317184 - }, - "winogrande": { - "acc": 0.5816890292028414, - "acc_stderr": 0.013863669961195918 - }, - "storycloze_2016": { - "acc": 0.7188669160876536, - "acc_stderr": 0.010395836091628113 - }, - "boolq": { - "acc": 0.636697247706422, - "acc_stderr": 0.008411885836787163 - }, - "arc_easy": { - "acc": 0.6254208754208754, - "acc_stderr": 0.00993175882041061, - "acc_norm": 0.5993265993265994, - "acc_norm_stderr": 0.010055304474255585 - }, - "arc_challenge": { - "acc": 0.30716723549488056, - "acc_stderr": 0.013481034054980945, - "acc_norm": 0.33276450511945393, - "acc_norm_stderr": 0.013769863046192314 - }, - "sciq": { - "acc": 0.904, - "acc_stderr": 0.009320454434783222, - "acc_norm": 0.893, - "acc_norm_stderr": 0.009779910359847165 - }, - "piqa": { - "acc": 0.749727965179543, - "acc_stderr": 0.010106561880089786, - "acc_norm": 0.76550598476605, - "acc_norm_stderr": 0.00988520314324054 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_3.csv b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..bbdcd54eb970709277856a21e7f4579288cb6aa3 --- /dev/null +++ b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.31,0.014632638658632896,0 +anli_r2,acc,0.354,0.015129868238451773,0 +anli_r3,acc,0.3375,0.013655897185463655,0 +arc_challenge,acc,0.30631399317406144,0.013470584417276513,0 +arc_challenge,acc_norm,0.32593856655290104,0.013697432466693246,0 +arc_easy,acc,0.627104377104377,0.009922743197129253,0 +arc_easy,acc_norm,0.6081649831649831,0.010016835016834974,0 +boolq,acc,0.6238532110091743,0.008472516562330725,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.43517730496453905,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4761999601672974,0.004984125363319067,0 +hellaswag,acc_norm,0.6349332802230632,0.004804649197163698,0 +piqa,acc,0.7568008705114254,0.010009611953858917,0 +piqa,acc_norm,0.7665941240478781,0.009869247889520994,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.909,0.009099549538400245,0 +sciq,acc_norm,0.898,0.009575368801653886,0 +storycloze_2016,acc,0.7183324425440941,0.010401844358587662,0 +winogrande,acc,0.5872138910812944,0.013837060648682094,0 diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json 
b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json deleted file mode 100644 index 1708dd012c29c1e30d9c22890a5587ac54ac9d2e..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.31, - "acc_stderr": 0.014632638658632896 - }, - "anli_r2": { - "acc": 0.354, - "acc_stderr": 0.015129868238451773 - }, - "anli_r3": { - "acc": 0.3375, - "acc_stderr": 0.013655897185463655 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.43517730496453905 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4761999601672974, - "acc_stderr": 0.004984125363319067, - "acc_norm": 0.6349332802230632, - "acc_norm_stderr": 0.004804649197163698 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.5872138910812944, - "acc_stderr": 0.013837060648682094 - }, - "storycloze_2016": { - "acc": 0.7183324425440941, - "acc_stderr": 0.010401844358587662 - }, - "boolq": { - "acc": 0.6238532110091743, - "acc_stderr": 0.008472516562330725 - }, - "arc_easy": { - "acc": 0.627104377104377, - "acc_stderr": 0.009922743197129253, - "acc_norm": 0.6081649831649831, - "acc_norm_stderr": 0.010016835016834974 - }, - "arc_challenge": { - "acc": 0.30631399317406144, - "acc_stderr": 0.013470584417276513, - "acc_norm": 0.32593856655290104, - "acc_norm_stderr": 0.013697432466693246 - }, - "sciq": { - "acc": 0.909, - "acc_stderr": 0.009099549538400245, - "acc_norm": 0.898, - "acc_norm_stderr": 0.009575368801653886 - }, - "piqa": { - "acc": 0.7568008705114254, - "acc_stderr": 0.010009611953858917, - "acc_norm": 0.7665941240478781, - "acc_norm_stderr": 0.009869247889520994 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_4.csv b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..967ec9d5929a10418cfc45c21b59ec409c8ad42a --- /dev/null +++ b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.342,0.015008706182121731,0 +anli_r2,acc,0.34,0.014987482264363937,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.3054607508532423,0.013460080478002501,0 +arc_challenge,acc_norm,0.33361774744027306,0.013778687054176546,0 +arc_easy,acc,0.6435185185185185,0.009828046544504422,0 +arc_easy,acc_norm,0.6165824915824916,0.009976995068264724,0 +boolq,acc,0.6321100917431193,0.008434276591093028,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.36394984326018814,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.47610037841067515,0.004984077906216097,0 +hellaswag,acc_norm,0.6333399721171081,0.004809077205343497,0 +piqa,acc,0.749183895538629,0.010113869547069044,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267312,0 +rte,acc,0.49097472924187724,0.030091559826331334,0 +sciq,acc,0.916,0.008776162089491123,0 +sciq,acc_norm,0.905,0.0092769101031033,0 
+storycloze_2016,acc,0.721004810261892,0.010371620932652793,0
+winogrande,acc,0.5895816890292028,0.013825107120035866,0
diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json
deleted file mode 100644
index e5c60eedda79fdb2fc8087612d8d2dde1c68d2e4..0000000000000000000000000000000000000000
--- a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.342,
-            "acc_stderr": 0.015008706182121731
-        },
-        "anli_r2": {
-            "acc": 0.34,
-            "acc_stderr": 0.014987482264363937
-        },
-        "anli_r3": {
-            "acc": 0.3491666666666667,
-            "acc_stderr": 0.013767075395077247
-        },
-        "cb": {
-            "acc": 0.42857142857142855,
-            "acc_stderr": 0.06672848092813058,
-            "f1": 0.36394984326018814
-        },
-        "copa": {
-            "acc": 0.8,
-            "acc_stderr": 0.040201512610368445
-        },
-        "hellaswag": {
-            "acc": 0.47610037841067515,
-            "acc_stderr": 0.004984077906216097,
-            "acc_norm": 0.6333399721171081,
-            "acc_norm_stderr": 0.004809077205343497
-        },
-        "rte": {
-            "acc": 0.49097472924187724,
-            "acc_stderr": 0.030091559826331334
-        },
-        "winogrande": {
-            "acc": 0.5895816890292028,
-            "acc_stderr": 0.013825107120035866
-        },
-        "storycloze_2016": {
-            "acc": 0.721004810261892,
-            "acc_stderr": 0.010371620932652793
-        },
-        "boolq": {
-            "acc": 0.6321100917431193,
-            "acc_stderr": 0.008434276591093028
-        },
-        "arc_easy": {
-            "acc": 0.6435185185185185,
-            "acc_stderr": 0.009828046544504422,
-            "acc_norm": 0.6165824915824916,
-            "acc_norm_stderr": 0.009976995068264724
-        },
-        "arc_challenge": {
-            "acc": 0.3054607508532423,
-            "acc_stderr": 0.013460080478002501,
-            "acc_norm": 0.33361774744027306,
-            "acc_norm_stderr": 0.013778687054176546
-        },
-        "sciq": {
-            "acc": 0.916,
-            "acc_stderr": 0.008776162089491123,
-            "acc_norm": 0.905,
-            "acc_norm_stderr": 0.0092769101031033
-        },
-        "piqa": {
-            "acc": 0.749183895538629,
-            "acc_stderr": 0.010113869547069044,
-            "acc_norm": 0.7589771490750816,
-            "acc_norm_stderr": 0.009979042717267312
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0,
-        "hellaswag": 0,
-        "rte": 0,
-        "winogrande": 0,
-        "storycloze_2016": 0,
-        "boolq": 1,
-        "arc_easy": 0,
-        "arc_challenge": 0,
-        "sciq": 0,
-        "piqa": 0
-    }
-}
\ No newline at end of file
diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_5.csv b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_5.csv
new file mode 100644
index 0000000000000000000000000000000000000000..915e548f3b07c573fea23929171ddbc0d2446eed
--- /dev/null
+++ b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_5.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.33,0.014876872027456732,0
+anli_r2,acc,0.346,0.015050266127564443,0
+anli_r3,acc,0.3258333333333333,0.01353542204341746,0
+arc_challenge,acc,0.3148464163822526,0.01357265770308495,0
+arc_challenge,acc_norm,0.3225255972696246,0.013659980894277366,0
+arc_easy,acc,0.6414141414141414,0.009840882301225297,0
+arc_easy,acc_norm,0.6136363636363636,0.009991296778159619,0
+boolq,acc,0.6256880733944954,0.008464246656443233,1
+cb,acc,0.4642857142857143,0.06724777654937658,1
+cb,f1,0.38268797942216715,,1
+copa,acc,0.79,0.040936018074033256,0
+hellaswag,acc,0.4753037243576977,0.0049836910991109115,0
+hellaswag,acc_norm,0.6382194781915953,0.004795337009118188,0
+piqa,acc,0.7584330794341676,0.009986718001804467,0
+piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
+rte,acc,0.5487364620938628,0.029953149241808946,0
+sciq,acc,0.923,0.008434580140240643,0
+sciq,acc_norm,0.915,0.008823426366942314,0
+storycloze_2016,acc,0.7161945483698557,0.010425696279730922,0
+winogrande,acc,0.5974743488555643,0.013782866831703048,0
diff --git a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_5_lm-eval_global_step80108_2023-02-24-21-45-59_5shots_backup.json b/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_5_lm-eval_global_step80108_2023-02-24-21-45-59_5shots_backup.json
deleted file mode 100644
index c8b42e7cc1c246782629160b48d2556cc5d71e54..0000000000000000000000000000000000000000
--- a/4b284b21bc4seed2/evaluation/rankeval/4b284b21bc4seed2_5_lm-eval_global_step80108_2023-02-24-21-45-59_5shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.33,
-            "acc_stderr": 0.014876872027456732
-        },
-        "anli_r2": {
-            "acc": 0.346,
-            "acc_stderr": 0.015050266127564443
-        },
-        "anli_r3": {
-            "acc": 0.3258333333333333,
-            "acc_stderr": 0.01353542204341746
-        },
-        "cb": {
-            "acc": 0.4642857142857143,
-            "acc_stderr": 0.06724777654937658,
-            "f1": 0.38268797942216715
-        },
-        "copa": {
-            "acc": 0.79,
-            "acc_stderr": 0.040936018074033256
-        },
-        "hellaswag": {
-            "acc": 0.4753037243576977,
-            "acc_stderr": 0.0049836910991109115,
-            "acc_norm": 0.6382194781915953,
-            "acc_norm_stderr": 0.004795337009118188
-        },
-        "rte": {
-            "acc": 0.5487364620938628,
-            "acc_stderr": 0.029953149241808946
-        },
-        "winogrande": {
-            "acc": 0.5974743488555643,
-            "acc_stderr": 0.013782866831703048
-        },
-        "storycloze_2016": {
-            "acc": 0.7161945483698557,
-            "acc_stderr": 0.010425696279730922
-        },
-        "boolq": {
-            "acc": 0.6256880733944954,
-            "acc_stderr": 0.008464246656443233
-        },
-        "arc_easy": {
-            "acc": 0.6414141414141414,
-            "acc_stderr": 0.009840882301225297,
-            "acc_norm": 0.6136363636363636,
-            "acc_norm_stderr": 0.009991296778159619
-        },
-        "arc_challenge": {
-            "acc": 0.3148464163822526,
-            "acc_stderr": 0.01357265770308495,
-            "acc_norm": 0.3225255972696246,
-            "acc_norm_stderr": 0.013659980894277366
-        },
-        "sciq": {
-            "acc": 0.923,
-            "acc_stderr": 0.008434580140240643,
-            "acc_norm": 0.915,
-            "acc_norm_stderr": 0.008823426366942314
-        },
-        "piqa": {
-            "acc": 0.7584330794341676,
-            "acc_stderr": 0.009986718001804467,
-            "acc_norm": 0.7633297062023939,
-            "acc_norm_stderr": 0.009916841655042809
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0,
-        "hellaswag": 0,
-        "rte": 0,
-        "winogrande": 0,
-        "storycloze_2016": 0,
-        "boolq": 1,
-        "arc_easy": 0,
-        "arc_challenge": 0,
-        "sciq": 0,
-        "piqa": 0
-    }
-}
\ No newline at end of file
diff --git a/4b284b21bc4seed3/evaluation/generation/merged.csv b/4b284b21bc4seed3/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..9632fa8a415e45db71cd49e39bb5f2b7e3ba2cbb
--- /dev/null
+++ b/4b284b21bc4seed3/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.004517028931517297
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.004517028931517297
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.19719568941249035
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.19719568941249035
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22387083492928028
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22387083492928028
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23393615456741612
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23393615456741612
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23686616389681567
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23686616389681567
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23757613806327457
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23757613806327457
+e2e_nlg_cleaned,5,average,multiple,0.1889936683001324
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04779740432400501
+gem_xsum,0,median,rouge2_fmeasure,0.04779740432400501
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03675651772088566
+gem_xsum,1,median,rouge2_fmeasure,0.03675651772088566
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04085037987286443
+gem_xsum,2,median,rouge2_fmeasure,0.04085037987286443
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03976972720010616
+gem_xsum,3,median,rouge2_fmeasure,0.03976972720010616
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.011518858251746018
+gem_xsum,4,median,rouge2_fmeasure,0.011518858251746018
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0006390516220219271
+gem_xsum,5,median,rouge2_fmeasure,0.0006390516220219271
+gem_xsum,5,average,multiple,0.029555323165271534
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.052101576521840554
+web_nlg_en,0,median,rouge2_fmeasure,0.052101576521840554
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05919617667381684
+web_nlg_en,1,median,rouge2_fmeasure,0.05919617667381684
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05973961241278947
+web_nlg_en,2,median,rouge2_fmeasure,0.05973961241278947
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06110341345845585
+web_nlg_en,3,median,rouge2_fmeasure,0.06110341345845585
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.061203868765094024
+web_nlg_en,4,median,rouge2_fmeasure,0.061203868765094024
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.061162270106061005
+web_nlg_en,5,median,rouge2_fmeasure,0.061162270106061005
+web_nlg_en,5,average,multiple,0.059084486323009625
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04094718247147155
+wiki_lingua_en,0,median,rouge2_fmeasure,0.04094718247147155
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05398367197056974
+wiki_lingua_en,1,median,rouge2_fmeasure,0.05398367197056974
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05522229843864866
+wiki_lingua_en,2,median,rouge2_fmeasure,0.05522229843864866
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.044357885260241585
+wiki_lingua_en,3,median,rouge2_fmeasure,0.044357885260241585
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014135921586219569
+wiki_lingua_en,4,median,rouge2_fmeasure,0.014135921586219569
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002405387772695905
+wiki_lingua_en,5,median,rouge2_fmeasure,0.002405387772695905
+wiki_lingua_en,5,average,multiple,0.0351753912499745
diff --git a/4b284b21bc4seed3/evaluation/generation/merged.json b/4b284b21bc4seed3/evaluation/generation/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..8634dfdd516129ade21c828f37638d66fe1832ec
--- /dev/null
+++ b/4b284b21bc4seed3/evaluation/generation/merged.json
@@ -0,0 +1 @@
+{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.38535441961499517, "bleu_stderr": 0.03552663485548509, "rouge1_fmeasure": 0.11065558909994337, "rouge1_fmeasure_stderr": 0.0020016224462212732, "rouge1_precision": 0.07392314530016614, "rouge1_precision_stderr": 0.0017416928662543226, "rouge1_recall": 0.31161026039179884, "rouge1_recall_stderr": 0.0047814670849113125, "rouge2_fmeasure": 
0.052101576521840554, "rouge2_fmeasure_stderr": 0.001254434060918415, "rouge2_precision": 0.03387963683203527, "rouge2_precision_stderr": 0.0009418685192906572, "rouge2_recall": 0.15167229813850266, "rouge2_recall_stderr": 0.003249849487394629, "rougeL_fmeasure": 0.10721119439428567, "rougeL_fmeasure_stderr": 0.001872014217214712, "rougeL_precision": 0.07143509621818969, "rougeL_precision_stderr": 0.0016292918867949191, "rougeL_recall": 0.30392322887986606, "rougeL_recall_stderr": 0.004670559568432556, "rougeLsum_fmeasure": 0.10596966360979974, "rougeLsum_fmeasure_stderr": 0.001881280054371966, "rougeLsum_precision": 0.0707943641461647, "rougeLsum_precision_stderr": 0.0016472302036406458, "rougeLsum_recall": 0.2986281388001843, "rougeLsum_recall_stderr": 0.0045364052998747}}, "1": {"PALM_prompt": {"bleu": 0.5512129467367316, "bleu_stderr": 0.0395418255366758, "rouge1_fmeasure": 0.1276921926781691, "rouge1_fmeasure_stderr": 0.0018832476710271726, "rouge1_precision": 0.08196746460318716, "rouge1_precision_stderr": 0.0014754369831595647, "rouge1_recall": 0.40914147014095936, "rouge1_recall_stderr": 0.0053861796031930845, "rouge2_fmeasure": 0.05919617667381684, "rouge2_fmeasure_stderr": 0.0012055006346489835, "rouge2_precision": 0.03760548245806836, "rouge2_precision_stderr": 0.0008485772447386252, "rouge2_recall": 0.2004779256099399, "rouge2_recall_stderr": 0.0039022297404803946, "rougeL_fmeasure": 0.12093972964797003, "rougeL_fmeasure_stderr": 0.0017235569265706195, "rougeL_precision": 0.07759729021254366, "rougeL_precision_stderr": 0.0013607454921227458, "rougeL_recall": 0.38809219654571864, "rougeL_recall_stderr": 0.005037385606363737, "rougeLsum_fmeasure": 0.12119435037639528, "rougeLsum_fmeasure_stderr": 0.001776101552985325, "rougeLsum_precision": 0.07790072285955618, "rougeLsum_precision_stderr": 0.00140465482895153, "rougeLsum_recall": 0.38623053862621287, "rougeLsum_recall_stderr": 0.004945962835138011}}, "2": {"PALM_prompt": {"bleu": 0.5933018254177376, "bleu_stderr": 0.032599794867965604, "rouge1_fmeasure": 0.12805421977747078, "rouge1_fmeasure_stderr": 0.0017731859601500346, "rouge1_precision": 0.08138207976281618, "rouge1_precision_stderr": 0.001320028581758322, "rouge1_recall": 0.41969083219702635, "rouge1_recall_stderr": 0.0051938217749552314, "rouge2_fmeasure": 0.05973961241278947, "rouge2_fmeasure_stderr": 0.0011408079330925724, "rouge2_precision": 0.03774533356166905, "rouge2_precision_stderr": 0.0008138836014742545, "rouge2_recall": 0.21183011597903806, "rouge2_recall_stderr": 0.0039019044052842713, "rougeL_fmeasure": 0.12034876582957013, "rougeL_fmeasure_stderr": 0.0016107770822709627, "rougeL_precision": 0.07647361432812638, "rougeL_precision_stderr": 0.001197053850440824, "rougeL_recall": 0.3943345910974271, "rougeL_recall_stderr": 0.004791104064669637, "rougeLsum_fmeasure": 0.12158519620974508, "rougeLsum_fmeasure_stderr": 0.0016694470344012931, "rougeLsum_precision": 0.07733964505952771, "rougeLsum_precision_stderr": 0.001246953646848645, "rougeLsum_recall": 0.3972043924885929, "rougeLsum_recall_stderr": 0.0047988833920575725}}, "3": {"PALM_prompt": {"bleu": 0.6676820645573561, "bleu_stderr": 0.029698241159243657, "rouge1_fmeasure": 0.12983004472435894, "rouge1_fmeasure_stderr": 0.0017904512351955197, "rouge1_precision": 0.08199443734871886, "rouge1_precision_stderr": 0.0013044083678416126, "rouge1_recall": 0.4326296696301956, "rouge1_recall_stderr": 0.005319027568329476, "rouge2_fmeasure": 0.06110341345845585, "rouge2_fmeasure_stderr": 0.0011495434475286368, 
"rouge2_precision": 0.03836456544897309, "rouge2_precision_stderr": 0.0008016019880007896, "rouge2_recall": 0.2182970907135364, "rouge2_recall_stderr": 0.0039434390311428045, "rougeL_fmeasure": 0.12124821447948028, "rougeL_fmeasure_stderr": 0.0016083744034722292, "rougeL_precision": 0.07660172161550755, "rougeL_precision_stderr": 0.0011750201739650409, "rougeL_recall": 0.4026463135770865, "rougeL_recall_stderr": 0.004800741095877209, "rougeLsum_fmeasure": 0.12313018758539385, "rougeLsum_fmeasure_stderr": 0.0016863121667983272, "rougeLsum_precision": 0.07783198204724436, "rougeLsum_precision_stderr": 0.0012330922500882204, "rougeLsum_recall": 0.40862496584299657, "rougeLsum_recall_stderr": 0.004878219295388798}}, "4": {"PALM_prompt": {"bleu": 0.6836151545167003, "bleu_stderr": 0.029089321693769803, "rouge1_fmeasure": 0.13090538213539582, "rouge1_fmeasure_stderr": 0.0017570477742261594, "rouge1_precision": 0.08237641519365944, "rouge1_precision_stderr": 0.0012712811379550952, "rouge1_recall": 0.4410392252234467, "rouge1_recall_stderr": 0.005252924753314178, "rouge2_fmeasure": 0.061203868765094024, "rouge2_fmeasure_stderr": 0.001147850507777741, "rouge2_precision": 0.038294697830682935, "rouge2_precision_stderr": 0.0007935145188603823, "rouge2_recall": 0.22196349128777046, "rouge2_recall_stderr": 0.0039874400036358035, "rougeL_fmeasure": 0.12140909248039315, "rougeL_fmeasure_stderr": 0.00158909093789567, "rougeL_precision": 0.07647361824568906, "rougeL_precision_stderr": 0.0011522230823326422, "rougeL_recall": 0.40690715092516466, "rougeL_recall_stderr": 0.0047151717706141886, "rougeLsum_fmeasure": 0.12407530125753383, "rougeLsum_fmeasure_stderr": 0.0016569917198277998, "rougeLsum_precision": 0.07814715491035078, "rougeLsum_precision_stderr": 0.0012030810779768226, "rougeLsum_recall": 0.4163700311086396, "rougeLsum_recall_stderr": 0.00483547149960936}}, "5": {"PALM_prompt": {"bleu": 0.770273022704267, "bleu_stderr": 0.034315297186777595, "rouge1_fmeasure": 0.13101067212098677, "rouge1_fmeasure_stderr": 0.0017066218312728107, "rouge1_precision": 0.08203708357705593, "rouge1_precision_stderr": 0.0012436710783140608, "rouge1_recall": 0.4527088578344478, "rouge1_recall_stderr": 0.005270512909014665, "rouge2_fmeasure": 0.061162270106061005, "rouge2_fmeasure_stderr": 0.001104511682001034, "rouge2_precision": 0.03796634902700147, "rouge2_precision_stderr": 0.0007628000916252061, "rouge2_recall": 0.23182459931655927, "rouge2_recall_stderr": 0.004137297126449473, "rougeL_fmeasure": 0.12018039624183414, "rougeL_fmeasure_stderr": 0.0015114350444937687, "rougeL_precision": 0.07531042150309193, "rougeL_precision_stderr": 0.0011067305255883584, "rougeL_recall": 0.41469386205754033, "rougeL_recall_stderr": 0.004724284504576538, "rougeLsum_fmeasure": 0.1235732065654381, "rougeLsum_fmeasure_stderr": 0.0015991738579748697, "rougeLsum_precision": 0.07740826323217029, "rougeLsum_precision_stderr": 0.0011667399100477348, "rougeLsum_recall": 0.426759509472668, "rougeLsum_recall_stderr": 0.004893799458515002}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.8921440063023223, "bleu_stderr": 0.07135112924486753, "rouge1_fmeasure": 0.18416362480898446, "rouge1_fmeasure_stderr": 0.0019831229958442082, "rouge1_precision": 0.15679162263674679, "rouge1_precision_stderr": 0.0019991369518845767, "rouge1_recall": 0.2689745748726192, "rouge1_recall_stderr": 0.0029016224540331552, "rouge2_fmeasure": 0.04094718247147155, "rouge2_fmeasure_stderr": 0.0009244238662338802, "rouge2_precision": 0.03467120760519669, 
"rouge2_precision_stderr": 0.0008221744008462381, "rouge2_recall": 0.06209758141565476, "rouge2_recall_stderr": 0.0015832249597357217, "rougeL_fmeasure": 0.14338902877501125, "rougeL_fmeasure_stderr": 0.0014226956074527863, "rougeL_precision": 0.12082089693911753, "rougeL_precision_stderr": 0.0014097474863449463, "rougeL_recall": 0.2144029920856489, "rougeL_recall_stderr": 0.002350903120057298, "rougeLsum_fmeasure": 0.1686525979999314, "rougeLsum_fmeasure_stderr": 0.0018124703517444088, "rougeLsum_precision": 0.1433977232378843, "rougeLsum_precision_stderr": 0.0018222721919399013, "rougeLsum_recall": 0.24699886538862653, "rougeLsum_recall_stderr": 0.0026942999677837982}}, "1": {"tldr_en": {"bleu": 2.8056438776711543, "bleu_stderr": 0.0837473028373607, "rouge1_fmeasure": 0.22027730929537712, "rouge1_fmeasure_stderr": 0.0019608589609497263, "rouge1_precision": 0.18997860617259896, "rouge1_precision_stderr": 0.0021315131916909724, "rouge1_recall": 0.31970644615930655, "rouge1_recall_stderr": 0.0028665402668271315, "rouge2_fmeasure": 0.05398367197056974, "rouge2_fmeasure_stderr": 0.0010308834876363582, "rouge2_precision": 0.046578501178857534, "rouge2_precision_stderr": 0.0009806080013812627, "rouge2_recall": 0.08133508293170587, "rouge2_recall_stderr": 0.0017593067161220152, "rougeL_fmeasure": 0.15563014047034032, "rougeL_fmeasure_stderr": 0.0013088183409122065, "rougeL_precision": 0.1329141418602316, "rougeL_precision_stderr": 0.0014073779291519839, "rougeL_recall": 0.23157078499816725, "rougeL_recall_stderr": 0.002239708119405446, "rougeLsum_fmeasure": 0.2072131798418733, "rougeLsum_fmeasure_stderr": 0.0018385432787880404, "rougeLsum_precision": 0.1786247909527116, "rougeLsum_precision_stderr": 0.0020029209788265186, "rougeLsum_recall": 0.3014126188767162, "rougeLsum_recall_stderr": 0.0027138736966177968}}, "2": {"tldr_en": {"bleu": 2.9780392590967417, "bleu_stderr": 0.07485005723665485, "rouge1_fmeasure": 0.22219895534581124, "rouge1_fmeasure_stderr": 0.0019115852755068978, "rouge1_precision": 0.19848212426092088, "rouge1_precision_stderr": 0.0023139570614517783, "rouge1_recall": 0.31608172323314615, "rouge1_recall_stderr": 0.0027622954997235665, "rouge2_fmeasure": 0.05522229843864866, "rouge2_fmeasure_stderr": 0.0010366643780952015, "rouge2_precision": 0.049990983654620055, "rouge2_precision_stderr": 0.0011071845883649219, "rouge2_recall": 0.08099563488003071, "rouge2_recall_stderr": 0.0017279951131225705, "rougeL_fmeasure": 0.15589253405198344, "rougeL_fmeasure_stderr": 0.0012973150218916173, "rougeL_precision": 0.1388108427148649, "rougeL_precision_stderr": 0.0016626769142365268, "rougeL_recall": 0.22710405861686703, "rougeL_recall_stderr": 0.0022002521830572098, "rougeLsum_fmeasure": 0.2100590256285955, "rougeLsum_fmeasure_stderr": 0.0018001298788378707, "rougeLsum_precision": 0.18757113799557207, "rougeLsum_precision_stderr": 0.0021914037253074584, "rougeLsum_recall": 0.2994878980311641, "rougeLsum_recall_stderr": 0.0026463012724863054}}, "3": {"tldr_en": {"bleu": 2.847512210211642, "bleu_stderr": 0.08130288880844025, "rouge1_fmeasure": 0.18197691858961074, "rouge1_fmeasure_stderr": 0.002206594668965237, "rouge1_precision": 0.17134837432784916, "rouge1_precision_stderr": 0.002648910653676966, "rouge1_recall": 0.25800000600290435, "rouge1_recall_stderr": 0.003313810821871368, "rouge2_fmeasure": 0.044357885260241585, "rouge2_fmeasure_stderr": 0.0009768044123671569, "rouge2_precision": 0.04161503586356084, "rouge2_precision_stderr": 0.0011270454940689701, "rouge2_recall": 
0.06562904781876294, "rouge2_recall_stderr": 0.0016912238278118353, "rougeL_fmeasure": 0.13010623261113158, "rougeL_fmeasure_stderr": 0.0015513282273767694, "rougeL_precision": 0.12351815672543218, "rougeL_precision_stderr": 0.002041531952456252, "rougeL_recall": 0.18830813752138773, "rougeL_recall_stderr": 0.0025923821492611896, "rougeLsum_fmeasure": 0.17213683153876608, "rougeLsum_fmeasure_stderr": 0.0020822081122974664, "rougeLsum_precision": 0.16212518594924535, "rougeLsum_precision_stderr": 0.0025194852893668154, "rougeLsum_recall": 0.24439649503938565, "rougeLsum_recall_stderr": 0.0031617457131678586}}, "4": {"tldr_en": {"bleu": 0.6551089372553552, "bleu_stderr": 0.04895444931980532, "rouge1_fmeasure": 0.057107296289159025, "rouge1_fmeasure_stderr": 0.0019494380017288225, "rouge1_precision": 0.05513001019844826, "rouge1_precision_stderr": 0.00215178654051168, "rouge1_recall": 0.08454226315750968, "rouge1_recall_stderr": 0.002935250128250436, "rouge2_fmeasure": 0.014135921586219569, "rouge2_fmeasure_stderr": 0.0006914493317450905, "rouge2_precision": 0.013800421209577515, "rouge2_precision_stderr": 0.000858478532665541, "rouge2_recall": 0.021959372595934164, "rouge2_recall_stderr": 0.001194364786706014, "rougeL_fmeasure": 0.04216170229604133, "rougeL_fmeasure_stderr": 0.0014236179778487777, "rougeL_precision": 0.04128275764961394, "rougeL_precision_stderr": 0.0016823042515051513, "rougeL_recall": 0.0637605084786262, "rougeL_recall_stderr": 0.0022692310247988275, "rougeLsum_fmeasure": 0.05379079735955767, "rougeLsum_fmeasure_stderr": 0.0018282128528245283, "rougeLsum_precision": 0.051975929888249256, "rougeLsum_precision_stderr": 0.002028731950450667, "rougeLsum_recall": 0.07986443919003557, "rougeLsum_recall_stderr": 0.0027755366306270973}}, "5": {"tldr_en": {"bleu": 9.282109221159003e-07, "bleu_stderr": 2.2223612520502284e-06, "rouge1_fmeasure": 0.009311675141314384, "rouge1_fmeasure_stderr": 0.0008747369590243084, "rouge1_precision": 0.009676230361373621, "rouge1_precision_stderr": 0.0010092112696239383, "rouge1_recall": 0.014009287778858652, "rouge1_recall_stderr": 0.0013693091057835286, "rouge2_fmeasure": 0.002405387772695905, "rouge2_fmeasure_stderr": 0.00030042578765582695, "rouge2_precision": 0.002516778405932201, "rouge2_precision_stderr": 0.0003813647101277748, "rouge2_recall": 0.004189787625575326, "rouge2_recall_stderr": 0.0006261364681498638, "rougeL_fmeasure": 0.006932807281073693, "rougeL_fmeasure_stderr": 0.0006486704498845653, "rougeL_precision": 0.007375708880019688, "rougeL_precision_stderr": 0.0007995038887537955, "rougeL_recall": 0.010624417220719972, "rougeL_recall_stderr": 0.0010701965664864565, "rougeLsum_fmeasure": 0.008662236702193958, "rougeLsum_fmeasure_stderr": 0.0008161476026799892, "rougeLsum_precision": 0.009118809831121096, "rougeLsum_precision_stderr": 0.0009682743549182715, "rougeLsum_recall": 0.01302829985395723, "rougeLsum_recall_stderr": 0.0012830127121259097}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.6114974539510816, "bleu_stderr": 0.04527113203185075, "rouge1_fmeasure": 0.03492593073320387, "rouge1_fmeasure_stderr": 0.0010376513872745155, "rouge1_precision": 0.05869533205508546, "rouge1_precision_stderr": 0.001497949313805969, "rouge1_recall": 0.03571922348667764, "rouge1_recall_stderr": 0.0013948024749901261, "rouge2_fmeasure": 0.004517028931517297, "rouge2_fmeasure_stderr": 0.00036439768835932237, "rouge2_precision": 0.004063213985026975, "rouge2_precision_stderr": 0.0003541545577591494, "rouge2_recall": 
0.006573047412918524, "rouge2_recall_stderr": 0.0005456263764429907, "rougeL_fmeasure": 0.03456601494605734, "rougeL_fmeasure_stderr": 0.0010229273732960667, "rougeL_precision": 0.05805625987920658, "rougeL_precision_stderr": 0.0014726688390761706, "rougeL_recall": 0.03532952206585685, "rougeL_recall_stderr": 0.0013716820040930184, "rougeLsum_fmeasure": 0.033356146432655866, "rougeLsum_fmeasure_stderr": 0.0009612249675140079, "rougeLsum_precision": 0.057416666705387066, "rougeLsum_precision_stderr": 0.001471227198507916, "rougeLsum_recall": 0.033493295914568566, "rougeLsum_recall_stderr": 0.0012685265746055889}}, "1": {"generate_text_restaurant": {"bleu": 11.33447911090928, "bleu_stderr": 0.14535102796873467, "rouge1_fmeasure": 0.433797396683994, "rouge1_fmeasure_stderr": 0.0022987607409721217, "rouge1_precision": 0.5083802018676299, "rouge1_precision_stderr": 0.0032791833916447554, "rouge1_recall": 0.42190893586204214, "rouge1_recall_stderr": 0.002971819042737358, "rouge2_fmeasure": 0.19719568941249035, "rouge2_fmeasure_stderr": 0.0019377495452543357, "rouge2_precision": 0.23512517549601986, "rouge2_precision_stderr": 0.0025920962511690763, "rouge2_recall": 0.19142011398550732, "rouge2_recall_stderr": 0.0021166855984481392, "rougeL_fmeasure": 0.31401062106699107, "rougeL_fmeasure_stderr": 0.0019834092080858665, "rougeL_precision": 0.37102243518103833, "rougeL_precision_stderr": 0.002904005844748298, "rougeL_recall": 0.30457624020087987, "rougeL_recall_stderr": 0.0024007135826919603, "rougeLsum_fmeasure": 0.3549188128484648, "rougeLsum_fmeasure_stderr": 0.002246841679471083, "rougeLsum_precision": 0.41705381489741356, "rougeLsum_precision_stderr": 0.003122960946648587, "rougeLsum_recall": 0.34480646003082416, "rougeLsum_recall_stderr": 0.0027214625950243783}}, "2": {"generate_text_restaurant": {"bleu": 12.915730606248113, "bleu_stderr": 0.20474125011289426, "rouge1_fmeasure": 0.4631809810602233, "rouge1_fmeasure_stderr": 0.002254496022709034, "rouge1_precision": 0.5479863902027133, "rouge1_precision_stderr": 0.0032369184668208516, "rouge1_recall": 0.441447062617084, "rouge1_recall_stderr": 0.0029152320911209556, "rouge2_fmeasure": 0.22387083492928028, "rouge2_fmeasure_stderr": 0.0019825042986668373, "rouge2_precision": 0.2694942093855683, "rouge2_precision_stderr": 0.002668904008037024, "rouge2_recall": 0.2131243378825482, "rouge2_recall_stderr": 0.0021529009824731303, "rougeL_fmeasure": 0.34081459166299144, "rougeL_fmeasure_stderr": 0.002030967494387809, "rougeL_precision": 0.4057365701684737, "rougeL_precision_stderr": 0.0029348734509084765, "rougeL_recall": 0.3240876063453124, "rougeL_recall_stderr": 0.002423564725696539, "rougeLsum_fmeasure": 0.382975161960982, "rougeLsum_fmeasure_stderr": 0.0022658855724966864, "rougeLsum_precision": 0.45419008644409004, "rougeLsum_precision_stderr": 0.0031526651706947012, "rougeLsum_recall": 0.3644738218767085, "rougeLsum_recall_stderr": 0.0027027920241602162}}, "3": {"generate_text_restaurant": {"bleu": 13.799049670661745, "bleu_stderr": 0.1818766642053644, "rouge1_fmeasure": 0.4733021068886725, "rouge1_fmeasure_stderr": 0.002229748270982301, "rouge1_precision": 0.5538154552713349, "rouge1_precision_stderr": 0.0032031754343585057, "rouge1_recall": 0.4530213630135714, "rouge1_recall_stderr": 0.002889071639596583, "rouge2_fmeasure": 0.23393615456741612, "rouge2_fmeasure_stderr": 0.0020284322092199397, "rouge2_precision": 0.2776711960124572, "rouge2_precision_stderr": 0.002669354319886314, "rouge2_recall": 0.22402395817155157, "rouge2_recall_stderr": 
0.0022252252780088967, "rougeL_fmeasure": 0.34984277963163096, "rougeL_fmeasure_stderr": 0.0020650055785518843, "rougeL_precision": 0.4112498098162674, "rougeL_precision_stderr": 0.002912753431059431, "rougeL_recall": 0.33431398302211607, "rougeL_recall_stderr": 0.0024572409829041286, "rougeLsum_fmeasure": 0.3937728879658398, "rougeLsum_fmeasure_stderr": 0.0022887568890525446, "rougeLsum_precision": 0.46138958037109395, "rougeLsum_precision_stderr": 0.0031283915205367083, "rougeLsum_recall": 0.37645205860452513, "rougeLsum_recall_stderr": 0.0027190821967377665}}, "4": {"generate_text_restaurant": {"bleu": 14.332946979780699, "bleu_stderr": 0.187321161876581, "rouge1_fmeasure": 0.47698101030399137, "rouge1_fmeasure_stderr": 0.002304125507421037, "rouge1_precision": 0.5502767036745366, "rouge1_precision_stderr": 0.0032191855076160905, "rouge1_recall": 0.4601167935637563, "rouge1_recall_stderr": 0.002933097976024252, "rouge2_fmeasure": 0.23686616389681567, "rouge2_fmeasure_stderr": 0.0020891392412113624, "rouge2_precision": 0.2764817164388747, "rouge2_precision_stderr": 0.002683961252587485, "rouge2_recall": 0.22893953263224417, "rouge2_recall_stderr": 0.002301850243468405, "rougeL_fmeasure": 0.35299999087704836, "rougeL_fmeasure_stderr": 0.002138766539340371, "rougeL_precision": 0.4088077687087317, "rougeL_precision_stderr": 0.0029284110175963112, "rougeL_recall": 0.34034029264854043, "rougeL_recall_stderr": 0.002546438349753808, "rougeLsum_fmeasure": 0.3988932621808988, "rougeLsum_fmeasure_stderr": 0.002365288784954707, "rougeLsum_precision": 0.46022437266752997, "rougeLsum_precision_stderr": 0.0031504453494794983, "rougeLsum_recall": 0.38497978756866746, "rougeLsum_recall_stderr": 0.00282067010132265}}, "5": {"generate_text_restaurant": {"bleu": 14.123246792588205, "bleu_stderr": 0.18981316484333496, "rouge1_fmeasure": 0.4771521080848, "rouge1_fmeasure_stderr": 0.002247583418148515, "rouge1_precision": 0.5508575470527967, "rouge1_precision_stderr": 0.003172543361692212, "rouge1_recall": 0.4596250362933734, "rouge1_recall_stderr": 0.0028838674948470076, "rouge2_fmeasure": 0.23757613806327457, "rouge2_fmeasure_stderr": 0.0020631855681245196, "rouge2_precision": 0.27785561114819307, "rouge2_precision_stderr": 0.0026789931891456594, "rouge2_recall": 0.22893900538597906, "rouge2_recall_stderr": 0.002250810965306422, "rougeL_fmeasure": 0.35646844538814754, "rougeL_fmeasure_stderr": 0.002140481161608752, "rougeL_precision": 0.4128922710964347, "rougeL_precision_stderr": 0.0029408076764540847, "rougeL_recall": 0.3432096183345724, "rougeL_recall_stderr": 0.0025317938331892898, "rougeLsum_fmeasure": 0.4021976919673297, "rougeLsum_fmeasure_stderr": 0.0023234845320029575, "rougeLsum_precision": 0.46456472542175714, "rougeLsum_precision_stderr": 0.0031248622936883626, "rougeLsum_recall": 0.3874763961501519, "rougeLsum_recall_stderr": 0.0027737834865428587}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.949120361010708, "bleu_stderr": 0.0913394024073436, "rouge1_fmeasure": 0.20731019337768042, "rouge1_fmeasure_stderr": 0.0026329404650807355, "rouge1_precision": 0.15967577438396582, "rouge1_precision_stderr": 0.002367262573167645, "rouge1_recall": 0.3344097957704383, "rouge1_recall_stderr": 0.004542927765114456, "rouge2_fmeasure": 0.04779740432400501, "rouge2_fmeasure_stderr": 0.0016427567157124747, "rouge2_precision": 0.03607001702892568, "rouge2_precision_stderr": 0.001380428573706143, "rouge2_recall": 0.08085696005427423, "rouge2_recall_stderr": 0.0028599271086292063, "rougeL_fmeasure": 
0.1576331231529663, "rougeL_fmeasure_stderr": 0.0019951539963787477, "rougeL_precision": 0.12124210013250766, "rougeL_precision_stderr": 0.0018353599935403793, "rougeL_recall": 0.25607740438919824, "rougeL_recall_stderr": 0.0036003883952627205, "rougeLsum_fmeasure": 0.16181781523790845, "rougeLsum_fmeasure_stderr": 0.0022319416867268445, "rougeLsum_precision": 0.1242181328811872, "rougeLsum_precision_stderr": 0.0019619556407432866, "rougeLsum_recall": 0.26337373173306655, "rougeLsum_recall_stderr": 0.004025744770152854}}, "1": {"article_DOC_summary": {"bleu": 1.3894434901616817, "bleu_stderr": 0.0696818577382309, "rouge1_fmeasure": 0.17081587570763854, "rouge1_fmeasure_stderr": 0.002498720216403539, "rouge1_precision": 0.12169091971501833, "rouge1_precision_stderr": 0.0018485062787122516, "rouge1_recall": 0.2985653730389386, "rouge1_recall_stderr": 0.004290053847052186, "rouge2_fmeasure": 0.03675651772088566, "rouge2_fmeasure_stderr": 0.0014655540573612757, "rouge2_precision": 0.02579297493046093, "rouge2_precision_stderr": 0.0010284490730061247, "rouge2_recall": 0.06657394965502363, "rouge2_recall_stderr": 0.0027184784375237805, "rougeL_fmeasure": 0.1393113978609222, "rougeL_fmeasure_stderr": 0.0019655986810200457, "rougeL_precision": 0.09903048460831103, "rougeL_precision_stderr": 0.0014377536580800821, "rougeL_recall": 0.24509693147391448, "rougeL_recall_stderr": 0.003518979975606831, "rougeLsum_fmeasure": 0.13592809650365684, "rougeLsum_fmeasure_stderr": 0.0021149395154476942, "rougeLsum_precision": 0.09661401587871966, "rougeLsum_precision_stderr": 0.0015465093492546976, "rougeLsum_recall": 0.23906516897153357, "rougeLsum_recall_stderr": 0.003723173758494015}}, "2": {"article_DOC_summary": {"bleu": 1.519028859842843, "bleu_stderr": 0.10932835465017057, "rouge1_fmeasure": 0.1812341104304252, "rouge1_fmeasure_stderr": 0.0024420789648274964, "rouge1_precision": 0.12932415674769882, "rouge1_precision_stderr": 0.0018169766878992155, "rouge1_recall": 0.31588294560858055, "rouge1_recall_stderr": 0.004216644148639617, "rouge2_fmeasure": 0.04085037987286443, "rouge2_fmeasure_stderr": 0.0014441866865235443, "rouge2_precision": 0.02882443972103835, "rouge2_precision_stderr": 0.0010221845705623207, "rouge2_recall": 0.07321267293223659, "rouge2_recall_stderr": 0.0026752083496981217, "rougeL_fmeasure": 0.1497403639141996, "rougeL_fmeasure_stderr": 0.0019366476236409818, "rougeL_precision": 0.10675121708053566, "rougeL_precision_stderr": 0.0014369855194212536, "rougeL_recall": 0.26192576739698237, "rougeL_recall_stderr": 0.003428288975677526, "rougeLsum_fmeasure": 0.1406991990555183, "rougeLsum_fmeasure_stderr": 0.0019931907678247163, "rougeLsum_precision": 0.10009767181496847, "rougeLsum_precision_stderr": 0.0014618075217660356, "rougeLsum_recall": 0.2471962036151299, "rougeLsum_recall_stderr": 0.003570649420240086}}, "3": {"article_DOC_summary": {"bleu": 1.6370716620193742, "bleu_stderr": 0.0965616345735323, "rouge1_fmeasure": 0.17398457366386533, "rouge1_fmeasure_stderr": 0.0026011912639339633, "rouge1_precision": 0.12669828356422383, "rouge1_precision_stderr": 0.0020094917781438, "rouge1_recall": 0.2991027968300709, "rouge1_recall_stderr": 0.004591383921824911, "rouge2_fmeasure": 0.03976972720010616, "rouge2_fmeasure_stderr": 0.0015117103431558975, "rouge2_precision": 0.02846440216262034, "rouge2_precision_stderr": 0.0010956946533866802, "rouge2_recall": 0.0706413058306802, "rouge2_recall_stderr": 0.0027868384974826573, "rougeL_fmeasure": 0.14481676263378276, "rougeL_fmeasure_stderr": 
0.0021155906125953196, "rougeL_precision": 0.10522998362089304, "rougeL_precision_stderr": 0.0016066968369183466, "rougeL_recall": 0.24989448115090335, "rougeL_recall_stderr": 0.003818419767894932, "rougeLsum_fmeasure": 0.13514679304649532, "rougeLsum_fmeasure_stderr": 0.002182435634289256, "rougeLsum_precision": 0.09817348971720044, "rougeLsum_precision_stderr": 0.0016441110742716834, "rougeLsum_recall": 0.23394639249573773, "rougeLsum_recall_stderr": 0.0039702384387275}}, "4": {"article_DOC_summary": {"bleu": 0.8669272479971863, "bleu_stderr": 0.1405646989169273, "rouge1_fmeasure": 0.05007677852098019, "rouge1_fmeasure_stderr": 0.0028520829549217, "rouge1_precision": 0.04259238619227563, "rouge1_precision_stderr": 0.0027101471976600064, "rouge1_recall": 0.07720379689124833, "rouge1_recall_stderr": 0.0044603739276368026, "rouge2_fmeasure": 0.011518858251746018, "rouge2_fmeasure_stderr": 0.00100875710859823, "rouge2_precision": 0.01039550194215322, "rouge2_precision_stderr": 0.0013002683205402986, "rouge2_recall": 0.01810305312651028, "rouge2_recall_stderr": 0.0015721872697551358, "rougeL_fmeasure": 0.04100139142500078, "rougeL_fmeasure_stderr": 0.002305891008745547, "rougeL_precision": 0.03523709895823149, "rougeL_precision_stderr": 0.00227881816047799, "rougeL_recall": 0.06319492439650329, "rougeL_recall_stderr": 0.0036104810452688464, "rougeLsum_fmeasure": 0.040275422096666895, "rougeLsum_fmeasure_stderr": 0.0023372301427381265, "rougeLsum_precision": 0.03483815068281454, "rougeLsum_precision_stderr": 0.0023451054837002874, "rougeLsum_recall": 0.06212330010384987, "rougeLsum_recall_stderr": 0.00366876474124129}}, "5": {"article_DOC_summary": {"bleu": 1.0463937563324107e-36, "bleu_stderr": 3.980151871433784e-31, "rouge1_fmeasure": 0.0031002107990313337, "rouge1_fmeasure_stderr": 0.0008747175919556528, "rouge1_precision": 0.003588402424856072, "rouge1_precision_stderr": 0.001048591574930412, "rouge1_recall": 0.0028604653610329457, "rouge1_recall_stderr": 0.0007986614088290892, "rouge2_fmeasure": 0.0006390516220219271, "rouge2_fmeasure_stderr": 0.00030954190704298026, "rouge2_precision": 0.0007923025611704858, "rouge2_precision_stderr": 0.0003921450634363378, "rouge2_recall": 0.0005498039696152904, "rouge2_recall_stderr": 0.0002641981773098142, "rougeL_fmeasure": 0.002386725038982092, "rougeL_fmeasure_stderr": 0.0006780512190033147, "rougeL_precision": 0.0027648078129268614, "rougeL_precision_stderr": 0.0008127087627131919, "rougeL_recall": 0.0021867338334639443, "rougeL_recall_stderr": 0.0006127280790696822, "rougeLsum_fmeasure": 0.00262981517645366, "rougeLsum_fmeasure_stderr": 0.0007507411641340949, "rougeLsum_precision": 0.0030462311312536203, "rougeLsum_precision_stderr": 0.0009151504800039854, "rougeLsum_recall": 0.0024328059752266973, "rougeLsum_recall_stderr": 0.0006814073788898694}}}} \ No newline at end of file diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_0.csv b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..2f506cde72ec346346df5f6fa0de58c284c00171 --- /dev/null +++ b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795027,0 +anli_r2,acc,0.329,0.014865395385928364,0 +anli_r3,acc,0.34833333333333333,0.013759437498874072,0 +arc_challenge,acc,0.2713310580204778,0.01299380772754579,0 +arc_challenge,acc_norm,0.295221843003413,0.013329750293382318,0 
+arc_easy,acc,0.5909090909090909,0.010088775152615786,0 +arc_easy,acc_norm,0.5311447811447811,0.010239860250021745,0 +boolq,acc,0.6201834862385321,0.008488668235778617,1 +cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.3271604938271605,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.47410874327823144,0.004983087049281741,0 +hellaswag,acc_norm,0.619896434973113,0.004844199910173022,0 +piqa,acc,0.7595212187159956,0.009971345364651078,0 +piqa,acc_norm,0.764417845484222,0.009901067586473883,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.843,0.01151014697923019,0 +sciq,acc_norm,0.755,0.013607356839598123,0 +storycloze_2016,acc,0.726349545697488,0.0103097970944971,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_0_lm-eval_global_step80108_2023-02-15-11-04-04_0shots_backup.json b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_0_lm-eval_global_step80108_2023-02-15-11-04-04_0shots_backup.json deleted file mode 100644 index 202e8a628e77ed9a6d201409499ca76c31bd688f..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_0_lm-eval_global_step80108_2023-02-15-11-04-04_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795027 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928364 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874072 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.3271604938271605 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.47410874327823144, - "acc_stderr": 0.004983087049281741, - "acc_norm": 0.619896434973113, - "acc_norm_stderr": 0.004844199910173022 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.726349545697488, - "acc_stderr": 0.0103097970944971 - }, - "boolq": { - "acc": 0.6201834862385321, - "acc_stderr": 0.008488668235778617 - }, - "arc_easy": { - "acc": 0.5909090909090909, - "acc_stderr": 0.010088775152615786, - "acc_norm": 0.5311447811447811, - "acc_norm_stderr": 0.010239860250021745 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.01299380772754579, - "acc_norm": 0.295221843003413, - "acc_norm_stderr": 0.013329750293382318 - }, - "sciq": { - "acc": 0.843, - "acc_stderr": 0.01151014697923019, - "acc_norm": 0.755, - "acc_norm_stderr": 0.013607356839598123 - }, - "piqa": { - "acc": 0.7595212187159956, - "acc_stderr": 0.009971345364651078, - "acc_norm": 0.764417845484222, - "acc_norm_stderr": 0.009901067586473883 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_1.csv b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..8ca3d398408eeba556a408d49f527f48b8888c43 --- /dev/null +++ b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.01487687202745673,0 
+anli_r2,acc,0.318,0.0147340793093119,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.29436860068259385,0.013318528460539426,0 +arc_challenge,acc_norm,0.31569965870307165,0.013582571095815291,0 +arc_easy,acc,0.6077441077441077,0.010018744689650043,0 +arc_easy,acc_norm,0.5702861952861953,0.010157908005763674,0 +boolq,acc,0.6214067278287462,0.00848334171802448,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.3333333333333333,,1 +copa,acc,0.78,0.04163331998932262,0 +hellaswag,acc,0.46853216490738897,0.004979889597551665,0 +hellaswag,acc_norm,0.6188010356502689,0.004846886929763445,0 +piqa,acc,0.750272034820457,0.010099232969867483,0 +piqa,acc_norm,0.7573449401523396,0.010002002569708688,0 +rte,acc,0.5523465703971119,0.02993107036293953,0 +sciq,acc,0.887,0.010016552866696855,0 +sciq,acc_norm,0.856,0.01110798754893915,0 +storycloze_2016,acc,0.7145911277391769,0.010443395884062115,0 +winogrande,acc,0.580110497237569,0.013870943986310391,0 diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json deleted file mode 100644 index c092ca2613a7e0c3f70e027ce8d975f7238094e5..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.01487687202745673 - }, - "anli_r2": { - "acc": 0.318, - "acc_stderr": 0.0147340793093119 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077247 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.3333333333333333 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.46853216490738897, - "acc_stderr": 0.004979889597551665, - "acc_norm": 0.6188010356502689, - "acc_norm_stderr": 0.004846886929763445 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.02993107036293953 - }, - "winogrande": { - "acc": 0.580110497237569, - "acc_stderr": 0.013870943986310391 - }, - "storycloze_2016": { - "acc": 0.7145911277391769, - "acc_stderr": 0.010443395884062115 - }, - "boolq": { - "acc": 0.6214067278287462, - "acc_stderr": 0.00848334171802448 - }, - "arc_easy": { - "acc": 0.6077441077441077, - "acc_stderr": 0.010018744689650043, - "acc_norm": 0.5702861952861953, - "acc_norm_stderr": 0.010157908005763674 - }, - "arc_challenge": { - "acc": 0.29436860068259385, - "acc_stderr": 0.013318528460539426, - "acc_norm": 0.31569965870307165, - "acc_norm_stderr": 0.013582571095815291 - }, - "sciq": { - "acc": 0.887, - "acc_stderr": 0.010016552866696855, - "acc_norm": 0.856, - "acc_norm_stderr": 0.01110798754893915 - }, - "piqa": { - "acc": 0.750272034820457, - "acc_stderr": 0.010099232969867483, - "acc_norm": 0.7573449401523396, - "acc_norm_stderr": 0.010002002569708688 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_2.csv b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_2.csv new file mode 100644 index 
0000000000000000000000000000000000000000..3497e872011bb4aef3a941d77b66036aacdfe9e1 --- /dev/null +++ b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.332,0.014899597242811475,0 +anli_r2,acc,0.333,0.014910846164229864,0 +anli_r3,acc,0.34,0.013680495725767797,0 +arc_challenge,acc,0.295221843003413,0.01332975029338232,0 +arc_challenge,acc_norm,0.30802047781569963,0.013491429517292038,0 +arc_easy,acc,0.6195286195286195,0.009962305992058577,0 +arc_easy,acc_norm,0.5959595959595959,0.01006906164954955,0 +boolq,acc,0.6180428134556575,0.00849785199842719,1 +cb,acc,0.35714285714285715,0.06460957383809218,1 +cb,f1,0.2528248587570622,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.46932881896036643,0.004980384575535383,0 +hellaswag,acc_norm,0.6172077275443139,0.0048507486878599185,0 +piqa,acc,0.7464635473340587,0.010150090834551794,0 +piqa,acc_norm,0.7573449401523396,0.010002002569708688,0 +rte,acc,0.4693140794223827,0.03003973059219781,0 +sciq,acc,0.896,0.009658016218524301,0 +sciq,acc_norm,0.871,0.010605256784796586,0 +storycloze_2016,acc,0.7183324425440941,0.010401844358587665,0 +winogrande,acc,0.5927387529597474,0.013808654122417862,0 diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_2_lm-eval_global_step80108_2023-02-15-11-04-04_2shots_backup.json b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_2_lm-eval_global_step80108_2023-02-15-11-04-04_2shots_backup.json deleted file mode 100644 index 42328347da9ac54bcf526e47084be1d4427cae43..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_2_lm-eval_global_step80108_2023-02-15-11-04-04_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811475 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229864 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767797 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809218, - "f1": 0.2528248587570622 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.46932881896036643, - "acc_stderr": 0.004980384575535383, - "acc_norm": 0.6172077275443139, - "acc_norm_stderr": 0.0048507486878599185 - }, - "rte": { - "acc": 0.4693140794223827, - "acc_stderr": 0.03003973059219781 - }, - "winogrande": { - "acc": 0.5927387529597474, - "acc_stderr": 0.013808654122417862 - }, - "storycloze_2016": { - "acc": 0.7183324425440941, - "acc_stderr": 0.010401844358587665 - }, - "boolq": { - "acc": 0.6180428134556575, - "acc_stderr": 0.00849785199842719 - }, - "arc_easy": { - "acc": 0.6195286195286195, - "acc_stderr": 0.009962305992058577, - "acc_norm": 0.5959595959595959, - "acc_norm_stderr": 0.01006906164954955 - }, - "arc_challenge": { - "acc": 0.295221843003413, - "acc_stderr": 0.01332975029338232, - "acc_norm": 0.30802047781569963, - "acc_norm_stderr": 0.013491429517292038 - }, - "sciq": { - "acc": 0.896, - "acc_stderr": 0.009658016218524301, - "acc_norm": 0.871, - "acc_norm_stderr": 0.010605256784796586 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551794, - "acc_norm": 0.7573449401523396, - "acc_norm_stderr": 0.010002002569708688 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } 
-} \ No newline at end of file diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_3.csv b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..adbbbb7a5c5331a11a53d9ddd5258cac54145a29 --- /dev/null +++ b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.328,0.014853842487270333,0 +anli_r2,acc,0.351,0.015100563798316409,0 +anli_r3,acc,0.34833333333333333,0.013759437498874086,0 +arc_challenge,acc,0.30119453924914674,0.01340674176784762,0 +arc_challenge,acc_norm,0.31143344709897613,0.013532472099850947,0 +arc_easy,acc,0.6165824915824916,0.009976995068264717,0 +arc_easy,acc_norm,0.6060606060606061,0.010026305355981814,0 +boolq,acc,0.6058103975535168,0.008546995661233634,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.3815668202764977,,1 +copa,acc,0.8,0.04020151261036845,0 +hellaswag,acc,0.46863174666401114,0.004979952166595543,0 +hellaswag,acc_norm,0.6213901613224457,0.0048404936031661945,0 +piqa,acc,0.7611534276387377,0.0099481203853375,0 +piqa,acc_norm,0.7600652883569097,0.009963625892809545,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.903,0.009363689373248088,0 +sciq,acc_norm,0.891,0.009859828407037185,0 +storycloze_2016,acc,0.7258150721539284,0.010316062787590006,0 +winogrande,acc,0.574585635359116,0.013895257666646378,0 diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_3_lm-eval_global_step80108_2023-02-15-11-04-04_3shots_backup.json b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_3_lm-eval_global_step80108_2023-02-15-11-04-04_3shots_backup.json deleted file mode 100644 index 2a00a3b600e1cb988fff6461cd325153c7c9819e..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_3_lm-eval_global_step80108_2023-02-15-11-04-04_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.328, - "acc_stderr": 0.014853842487270333 - }, - "anli_r2": { - "acc": 0.351, - "acc_stderr": 0.015100563798316409 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874086 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.3815668202764977 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036845 - }, - "hellaswag": { - "acc": 0.46863174666401114, - "acc_stderr": 0.004979952166595543, - "acc_norm": 0.6213901613224457, - "acc_norm_stderr": 0.0048404936031661945 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.013895257666646378 - }, - "storycloze_2016": { - "acc": 0.7258150721539284, - "acc_stderr": 0.010316062787590006 - }, - "boolq": { - "acc": 0.6058103975535168, - "acc_stderr": 0.008546995661233634 - }, - "arc_easy": { - "acc": 0.6165824915824916, - "acc_stderr": 0.009976995068264717, - "acc_norm": 0.6060606060606061, - "acc_norm_stderr": 0.010026305355981814 - }, - "arc_challenge": { - "acc": 0.30119453924914674, - "acc_stderr": 0.01340674176784762, - "acc_norm": 0.31143344709897613, - "acc_norm_stderr": 0.013532472099850947 - }, - "sciq": { - "acc": 0.903, - "acc_stderr": 0.009363689373248088, - "acc_norm": 0.891, - "acc_norm_stderr": 0.009859828407037185 - }, - "piqa": { - "acc": 0.7611534276387377, - "acc_stderr": 0.0099481203853375, - "acc_norm": 0.7600652883569097, - "acc_norm_stderr": 0.009963625892809545 - } - }, - "versions": { - 
"anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_4.csv b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..4bf5fdbaa0104900a217b7d6a8c006b70761e167 --- /dev/null +++ b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.353,0.015120172605483699,0 +anli_r2,acc,0.36,0.015186527932040127,0 +anli_r3,acc,0.3641666666666667,0.013896714966807265,0 +arc_challenge,acc,0.3003412969283277,0.013395909309956999,0 +arc_challenge,acc_norm,0.3148464163822526,0.01357265770308495,0 +arc_easy,acc,0.6233164983164983,0.00994284807747617,0 +arc_easy,acc_norm,0.6077441077441077,0.010018744689650043,0 +boolq,acc,0.6220183486238532,0.008480656964585248,1 +cb,acc,0.5178571428571429,0.06737697508644647,1 +cb,f1,0.4129474011826953,,1 +copa,acc,0.78,0.041633319989322626,0 +hellaswag,acc,0.466938856801434,0.004978861409119803,0 +hellaswag,acc_norm,0.6222863971320454,0.004838246410786256,0 +piqa,acc,0.7524483133841132,0.010069703966857106,0 +piqa,acc_norm,0.7584330794341676,0.009986718001804453,0 +rte,acc,0.49097472924187724,0.030091559826331334,0 +sciq,acc,0.904,0.009320454434783248,0 +sciq,acc_norm,0.898,0.009575368801653897,0 +storycloze_2016,acc,0.7300908605024051,0.01026541350322146,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_4_lm-eval_global_step80108_2023-02-15-11-04-04_4shots_backup.json b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_4_lm-eval_global_step80108_2023-02-15-11-04-04_4shots_backup.json deleted file mode 100644 index feb5094ffb22482946bbcd9bbd20d27c44b5f997..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_4_lm-eval_global_step80108_2023-02-15-11-04-04_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.353, - "acc_stderr": 0.015120172605483699 - }, - "anli_r2": { - "acc": 0.36, - "acc_stderr": 0.015186527932040127 - }, - "anli_r3": { - "acc": 0.3641666666666667, - "acc_stderr": 0.013896714966807265 - }, - "cb": { - "acc": 0.5178571428571429, - "acc_stderr": 0.06737697508644647, - "f1": 0.4129474011826953 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.041633319989322626 - }, - "hellaswag": { - "acc": 0.466938856801434, - "acc_stderr": 0.004978861409119803, - "acc_norm": 0.6222863971320454, - "acc_norm_stderr": 0.004838246410786256 - }, - "rte": { - "acc": 0.49097472924187724, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7300908605024051, - "acc_stderr": 0.01026541350322146 - }, - "boolq": { - "acc": 0.6220183486238532, - "acc_stderr": 0.008480656964585248 - }, - "arc_easy": { - "acc": 0.6233164983164983, - "acc_stderr": 0.00994284807747617, - "acc_norm": 0.6077441077441077, - "acc_norm_stderr": 0.010018744689650043 - }, - "arc_challenge": { - "acc": 0.3003412969283277, - "acc_stderr": 0.013395909309956999, - "acc_norm": 0.3148464163822526, - "acc_norm_stderr": 0.01357265770308495 - }, - "sciq": { - "acc": 0.904, - "acc_stderr": 0.009320454434783248, - "acc_norm": 0.898, - 
"acc_norm_stderr": 0.009575368801653897 - }, - "piqa": { - "acc": 0.7524483133841132, - "acc_stderr": 0.010069703966857106, - "acc_norm": 0.7584330794341676, - "acc_norm_stderr": 0.009986718001804453 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_5.csv b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..896c885250a5a7c862b5a762c6fe6f031651091f --- /dev/null +++ b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.361,0.015195720118175124,0 +anli_r2,acc,0.332,0.014899597242811475,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.30631399317406144,0.013470584417276513,0 +arc_challenge,acc_norm,0.32337883959044367,0.013669421630012122,0 +arc_easy,acc,0.6245791245791246,0.0099362185271143,0 +arc_easy,acc_norm,0.6199494949494949,0.009960175831493131,0 +boolq,acc,0.6152905198776758,0.008509403073229692,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3336203597397627,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4676359290977893,0.004979317515432522,0 +hellaswag,acc_norm,0.6258713403704441,0.004829081532826523,0 +piqa,acc,0.7529923830250272,0.010062268140772629,0 +piqa,acc_norm,0.7540805223068553,0.010047331865625184,0 +rte,acc,0.5054151624548736,0.030094698123239966,0 +sciq,acc,0.913,0.008916866630745913,0 +sciq,acc_norm,0.904,0.009320454434783217,0 +storycloze_2016,acc,0.721004810261892,0.010371620932652793,0 +winogrande,acc,0.5816890292028414,0.013863669961195908,0 diff --git a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_5_lm-eval_global_step80108_2023-02-15-11-04-04_5shots_backup.json b/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_5_lm-eval_global_step80108_2023-02-15-11-04-04_5shots_backup.json deleted file mode 100644 index 0b980e8b40156ef5badcb4720f1f9b73674485fb..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed3/evaluation/rankeval/4b284b21bc4seed3_5_lm-eval_global_step80108_2023-02-15-11-04-04_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.361, - "acc_stderr": 0.015195720118175124 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811475 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.3336203597397627 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4676359290977893, - "acc_stderr": 0.004979317515432522, - "acc_norm": 0.6258713403704441, - "acc_norm_stderr": 0.004829081532826523 - }, - "rte": { - "acc": 0.5054151624548736, - "acc_stderr": 0.030094698123239966 - }, - "winogrande": { - "acc": 0.5816890292028414, - "acc_stderr": 0.013863669961195908 - }, - "storycloze_2016": { - "acc": 0.721004810261892, - "acc_stderr": 0.010371620932652793 - }, - "boolq": { - "acc": 0.6152905198776758, - "acc_stderr": 0.008509403073229692 - }, - "arc_easy": { - "acc": 0.6245791245791246, - "acc_stderr": 0.0099362185271143, - "acc_norm": 0.6199494949494949, - "acc_norm_stderr": 0.009960175831493131 - }, - "arc_challenge": { - "acc": 
0.30631399317406144, - "acc_stderr": 0.013470584417276513, - "acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.013669421630012122 - }, - "sciq": { - "acc": 0.913, - "acc_stderr": 0.008916866630745913, - "acc_norm": 0.904, - "acc_norm_stderr": 0.009320454434783217 - }, - "piqa": { - "acc": 0.7529923830250272, - "acc_stderr": 0.010062268140772629, - "acc_norm": 0.7540805223068553, - "acc_norm_stderr": 0.010047331865625184 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/generation/merged.csv b/4b284b21bc4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d10b31c5853482d69d8ec7f71a7d1fb68640e97f --- /dev/null +++ b/4b284b21bc4seed4/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.001114996015022125 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.001114996015022125 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.13429472916574672 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.13429472916574672 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.17540917102638537 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.17540917102638537 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19794956049387477 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19794956049387477 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.20632107444632167 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.20632107444632167 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.20666903463506434 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.20666903463506434 +e2e_nlg_cleaned,5,average,multiple,0.1536264276304025 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0474651150579007 +gem_xsum,0,median,rouge2_fmeasure,0.0474651150579007 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03433659588366439 +gem_xsum,1,median,rouge2_fmeasure,0.03433659588366439 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03828129617404584 +gem_xsum,2,median,rouge2_fmeasure,0.03828129617404584 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03719404523373464 +gem_xsum,3,median,rouge2_fmeasure,0.03719404523373464 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009927038748098277 +gem_xsum,4,median,rouge2_fmeasure,0.009927038748098277 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.000983708450751863 +gem_xsum,5,median,rouge2_fmeasure,0.000983708450751863 +gem_xsum,5,average,multiple,0.028031299924699285 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05052209059024575 +web_nlg_en,0,median,rouge2_fmeasure,0.05052209059024575 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05531307099139748 +web_nlg_en,1,median,rouge2_fmeasure,0.05531307099139748 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.0570319859750151 +web_nlg_en,2,median,rouge2_fmeasure,0.0570319859750151 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05661337603042263 +web_nlg_en,3,median,rouge2_fmeasure,0.05661337603042263 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.058223377247645175 +web_nlg_en,4,median,rouge2_fmeasure,0.058223377247645175 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.058684632135392434 +web_nlg_en,5,median,rouge2_fmeasure,0.058684632135392434 +web_nlg_en,5,average,multiple,0.05606475549501976 
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.035498190179559366 +wiki_lingua_en,0,median,rouge2_fmeasure,0.035498190179559366 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.056810181676750955 +wiki_lingua_en,1,median,rouge2_fmeasure,0.056810181676750955 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05419402589748163 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05419402589748163 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04352751746326409 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04352751746326409 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014698951085450463 +wiki_lingua_en,4,median,rouge2_fmeasure,0.014698951085450463 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002073109168721178 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002073109168721178 +wiki_lingua_en,5,average,multiple,0.03446699591187128 diff --git a/4b284b21bc4seed4/evaluation/generation/merged.json b/4b284b21bc4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..a03f52f8e406fc70f48bf9f297d2190b79432be2 --- /dev/null +++ b/4b284b21bc4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3751221736027223, "bleu_stderr": 0.04603981713785397, "rouge1_fmeasure": 0.10655282549139757, "rouge1_fmeasure_stderr": 0.002069892427617646, "rouge1_precision": 0.07050754951097193, "rouge1_precision_stderr": 0.0015950749741378038, "rouge1_recall": 0.2930621241248632, "rouge1_recall_stderr": 0.0047039720667729585, "rouge2_fmeasure": 0.05052209059024575, "rouge2_fmeasure_stderr": 0.0012866888304456589, "rouge2_precision": 0.03327005966333133, "rouge2_precision_stderr": 0.0009772300003902292, "rouge2_recall": 0.14224611225552608, "rouge2_recall_stderr": 0.0031500776280318555, "rougeL_fmeasure": 0.10267935520945128, "rougeL_fmeasure_stderr": 0.0019389896303185993, "rougeL_precision": 0.0677233404229661, "rougeL_precision_stderr": 0.0014833039402630307, "rougeL_recall": 0.28507940697548506, "rougeL_recall_stderr": 0.004611898165464154, "rougeLsum_fmeasure": 0.10179374812954574, "rougeLsum_fmeasure_stderr": 0.0019553887116743455, "rougeLsum_precision": 0.06739541442277049, "rougeLsum_precision_stderr": 0.0015164320427077738, "rougeLsum_recall": 0.27995528365978256, "rougeLsum_recall_stderr": 0.004446276996131344}}, "1": {"PALM_prompt": {"bleu": 0.5198830574890386, "bleu_stderr": 0.03504017360483535, "rouge1_fmeasure": 0.12070700921084229, "rouge1_fmeasure_stderr": 0.002002150459250641, "rouge1_precision": 0.07738811110777072, "rouge1_precision_stderr": 0.0015304988936609128, "rouge1_recall": 0.3862577997016925, "rouge1_recall_stderr": 0.00572058610283539, "rouge2_fmeasure": 0.05531307099139748, "rouge2_fmeasure_stderr": 0.0012137549494064656, "rouge2_precision": 0.03507152315512706, "rouge2_precision_stderr": 0.0008467381062339321, "rouge2_recall": 0.18709836433905874, "rouge2_recall_stderr": 0.003833792467120382, "rougeL_fmeasure": 0.11194710382233043, "rougeL_fmeasure_stderr": 0.0017263614451165634, "rougeL_precision": 0.07158564602943736, "rougeL_precision_stderr": 0.0013356095688320767, "rougeL_recall": 0.3620899416623159, "rougeL_recall_stderr": 0.005273165348503305, "rougeLsum_fmeasure": 0.11393170029640054, "rougeLsum_fmeasure_stderr": 0.0018446583441229345, "rougeLsum_precision": 0.07309280304036243, "rougeLsum_precision_stderr": 0.0014294724139865343, "rougeLsum_recall": 0.36442715703356776, "rougeLsum_recall_stderr": 0.005241997303110117}}, "2": {"PALM_prompt": {"bleu": 0.5772980788839762, "bleu_stderr": 0.0351401276773091, 
"rouge1_fmeasure": 0.12373316030300299, "rouge1_fmeasure_stderr": 0.0018974475884765722, "rouge1_precision": 0.07813906472896695, "rouge1_precision_stderr": 0.0013627013823755202, "rouge1_recall": 0.4101295780853266, "rouge1_recall_stderr": 0.005693087311032938, "rouge2_fmeasure": 0.0570319859750151, "rouge2_fmeasure_stderr": 0.0011521977790571885, "rouge2_precision": 0.03581354421318794, "rouge2_precision_stderr": 0.0008022617515173707, "rouge2_recall": 0.2032111746093155, "rouge2_recall_stderr": 0.003954708803559, "rougeL_fmeasure": 0.11350896035348979, "rougeL_fmeasure_stderr": 0.0016169887067107285, "rougeL_precision": 0.07158825167996366, "rougeL_precision_stderr": 0.0011655724191791303, "rougeL_recall": 0.3794140508633469, "rougeL_recall_stderr": 0.005145968326494213, "rougeLsum_fmeasure": 0.11690863992503238, "rougeLsum_fmeasure_stderr": 0.0017500511668341343, "rougeLsum_precision": 0.07387016720939305, "rougeLsum_precision_stderr": 0.0012627844516927061, "rougeLsum_recall": 0.38737624946784815, "rougeLsum_recall_stderr": 0.005248864126942714}}, "3": {"PALM_prompt": {"bleu": 0.6456226857113138, "bleu_stderr": 0.040165536189754096, "rouge1_fmeasure": 0.12436437584313036, "rouge1_fmeasure_stderr": 0.001814950124577219, "rouge1_precision": 0.0778881396479436, "rouge1_precision_stderr": 0.0012769567386518232, "rouge1_recall": 0.42296024366383517, "rouge1_recall_stderr": 0.00569044382493207, "rouge2_fmeasure": 0.05661337603042263, "rouge2_fmeasure_stderr": 0.001071749411178637, "rouge2_precision": 0.03521828535740504, "rouge2_precision_stderr": 0.0007298691703855176, "rouge2_recall": 0.20809711310932535, "rouge2_recall_stderr": 0.003931907827643346, "rougeL_fmeasure": 0.11292204203780767, "rougeL_fmeasure_stderr": 0.0015573442585516018, "rougeL_precision": 0.07076272874827239, "rougeL_precision_stderr": 0.0011113411679509696, "rougeL_recall": 0.3866270125315969, "rougeL_recall_stderr": 0.005107912406441445, "rougeLsum_fmeasure": 0.1168466330708585, "rougeLsum_fmeasure_stderr": 0.0016763512625120776, "rougeLsum_precision": 0.0732788083787352, "rougeLsum_precision_stderr": 0.0011890321940136864, "rougeLsum_recall": 0.39617638119143006, "rougeLsum_recall_stderr": 0.0051662731631678}}, "4": {"PALM_prompt": {"bleu": 0.7394949248334012, "bleu_stderr": 0.0375339718418186, "rouge1_fmeasure": 0.12719292338738244, "rouge1_fmeasure_stderr": 0.0017548996669778086, "rouge1_precision": 0.07935954064608862, "rouge1_precision_stderr": 0.0012440228309059662, "rouge1_recall": 0.44189075126537175, "rouge1_recall_stderr": 0.0056160164221400995, "rouge2_fmeasure": 0.058223377247645175, "rouge2_fmeasure_stderr": 0.001073250860654579, "rouge2_precision": 0.03609299322498633, "rouge2_precision_stderr": 0.0007391134009433947, "rouge2_recall": 0.21980568290237434, "rouge2_recall_stderr": 0.004020542118197972, "rougeL_fmeasure": 0.1144359922322097, "rougeL_fmeasure_stderr": 0.0014869489906081234, "rougeL_precision": 0.07145742950585998, "rougeL_precision_stderr": 0.0010699196551391536, "rougeL_recall": 0.3991369324047731, "rougeL_recall_stderr": 0.0049417529681389525, "rougeLsum_fmeasure": 0.11931787390462796, "rougeLsum_fmeasure_stderr": 0.0016294893480413802, "rougeLsum_precision": 0.07454021703025861, "rougeLsum_precision_stderr": 0.0011655709781687874, "rougeLsum_recall": 0.41368909933661674, "rougeLsum_recall_stderr": 0.005127640598028314}}, "5": {"PALM_prompt": {"bleu": 0.7350884275632563, "bleu_stderr": 0.04285525370548216, "rouge1_fmeasure": 0.12790081410584203, "rouge1_fmeasure_stderr": 
0.001762758952960184, "rouge1_precision": 0.07963440574460627, "rouge1_precision_stderr": 0.0012450999827039412, "rouge1_recall": 0.450750421306443, "rouge1_recall_stderr": 0.005688854042032203, "rouge2_fmeasure": 0.058684632135392434, "rouge2_fmeasure_stderr": 0.0010628257100155462, "rouge2_precision": 0.03626337631746513, "rouge2_precision_stderr": 0.0007247188269312321, "rouge2_recall": 0.22702405178339297, "rouge2_recall_stderr": 0.004195235523736308, "rougeL_fmeasure": 0.11440641190075858, "rougeL_fmeasure_stderr": 0.0014753694031490337, "rougeL_precision": 0.07125643393688096, "rougeL_precision_stderr": 0.00105707615542693, "rougeL_recall": 0.4063747251874938, "rougeL_recall_stderr": 0.005069676186964856, "rougeLsum_fmeasure": 0.11957698534047122, "rougeLsum_fmeasure_stderr": 0.0016217012306438583, "rougeLsum_precision": 0.07452930873504321, "rougeLsum_precision_stderr": 0.0011537233047546483, "rougeLsum_recall": 0.42116829387113064, "rougeLsum_recall_stderr": 0.005204895411040752}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5445764212442845, "bleu_stderr": 0.046348567423538295, "rouge1_fmeasure": 0.17846806393672296, "rouge1_fmeasure_stderr": 0.0018156572490314613, "rouge1_precision": 0.15229529499726693, "rouge1_precision_stderr": 0.0018532935787449235, "rouge1_recall": 0.2584223718261279, "rouge1_recall_stderr": 0.0025516613510239644, "rouge2_fmeasure": 0.035498190179559366, "rouge2_fmeasure_stderr": 0.0008425007414760624, "rouge2_precision": 0.030189999709011046, "rouge2_precision_stderr": 0.0007527178027814116, "rouge2_recall": 0.05276107785013939, "rouge2_recall_stderr": 0.0013537005512275853, "rougeL_fmeasure": 0.13992548854671796, "rougeL_fmeasure_stderr": 0.0012925170885687428, "rougeL_precision": 0.11790693896267036, "rougeL_precision_stderr": 0.001294367886229787, "rougeL_recall": 0.20826559324975696, "rougeL_recall_stderr": 0.0020968023223764095, "rougeLsum_fmeasure": 0.16300806309885615, "rougeLsum_fmeasure_stderr": 0.0016517242441511555, "rougeLsum_precision": 0.1388827214194179, "rougeLsum_precision_stderr": 0.0016844636725997805, "rougeLsum_recall": 0.2370411466342553, "rougeLsum_recall_stderr": 0.002357902527598024}}, "1": {"tldr_en": {"bleu": 2.9354736259244234, "bleu_stderr": 0.08184998217131226, "rouge1_fmeasure": 0.22871763440008758, "rouge1_fmeasure_stderr": 0.0019504250194142293, "rouge1_precision": 0.19660419529835046, "rouge1_precision_stderr": 0.0021703512663018435, "rouge1_recall": 0.33305075642202464, "rouge1_recall_stderr": 0.0027422057559937344, "rouge2_fmeasure": 0.056810181676750955, "rouge2_fmeasure_stderr": 0.0010692773805419653, "rouge2_precision": 0.048646612266404474, "rouge2_precision_stderr": 0.0010079743018688542, "rouge2_recall": 0.08571356139913104, "rouge2_recall_stderr": 0.0017928353996558705, "rougeL_fmeasure": 0.1584532961497466, "rougeL_fmeasure_stderr": 0.0012813295440813953, "rougeL_precision": 0.13473919020916866, "rougeL_precision_stderr": 0.0014138439503997033, "rougeL_recall": 0.2377264850157444, "rougeL_recall_stderr": 0.0022081866868395457, "rougeLsum_fmeasure": 0.21440206759227748, "rougeLsum_fmeasure_stderr": 0.0018298647451743082, "rougeLsum_precision": 0.18408931555378133, "rougeLsum_precision_stderr": 0.0020314434795525877, "rougeLsum_recall": 0.3130915982176866, "rougeLsum_recall_stderr": 0.002608677189171684}}, "2": {"tldr_en": {"bleu": 2.988336179481119, "bleu_stderr": 0.07284534182808335, "rouge1_fmeasure": 0.2216752304780583, "rouge1_fmeasure_stderr": 0.0018496410521841634, "rouge1_precision": 
0.19308251629949974, "rouge1_precision_stderr": 0.0021372296672672315, "rouge1_recall": 0.3226572993341573, "rouge1_recall_stderr": 0.002715283947112145, "rouge2_fmeasure": 0.05419402589748163, "rouge2_fmeasure_stderr": 0.0010092487989924672, "rouge2_precision": 0.04755808223353882, "rouge2_precision_stderr": 0.001056965112675896, "rouge2_recall": 0.08157607236301773, "rouge2_recall_stderr": 0.0016928843275132437, "rougeL_fmeasure": 0.15534336307223465, "rougeL_fmeasure_stderr": 0.0012446244313757237, "rougeL_precision": 0.13475955724902108, "rougeL_precision_stderr": 0.0015213837884005157, "rougeL_recall": 0.23151769344902068, "rougeL_recall_stderr": 0.002143281502769755, "rougeLsum_fmeasure": 0.2092469074780288, "rougeLsum_fmeasure_stderr": 0.0017297088792391818, "rougeLsum_precision": 0.18213738467572518, "rougeLsum_precision_stderr": 0.0020147435204007475, "rougeLsum_recall": 0.3053278835262653, "rougeLsum_recall_stderr": 0.0025811768541287375}}, "3": {"tldr_en": {"bleu": 2.955313094852994, "bleu_stderr": 0.0766356215689601, "rouge1_fmeasure": 0.18235828185303843, "rouge1_fmeasure_stderr": 0.0022330522398271444, "rouge1_precision": 0.16818607657817242, "rouge1_precision_stderr": 0.002571340600589284, "rouge1_recall": 0.2618591281994728, "rouge1_recall_stderr": 0.0033215062620397654, "rouge2_fmeasure": 0.04352751746326409, "rouge2_fmeasure_stderr": 0.000986420480404656, "rouge2_precision": 0.039906055320126824, "rouge2_precision_stderr": 0.0011176532929901056, "rouge2_recall": 0.06504905939729995, "rouge2_recall_stderr": 0.0016338067696316037, "rougeL_fmeasure": 0.12850084937451378, "rougeL_fmeasure_stderr": 0.0015346657748469231, "rougeL_precision": 0.11910589220232795, "rougeL_precision_stderr": 0.001919539273889145, "rougeL_recall": 0.18906464586111238, "rougeL_recall_stderr": 0.002550157025960148, "rougeLsum_fmeasure": 0.17275184259901263, "rougeLsum_fmeasure_stderr": 0.0021141809407287217, "rougeLsum_precision": 0.1592819269786109, "rougeLsum_precision_stderr": 0.0024458047067507224, "rougeLsum_recall": 0.24869687852492026, "rougeLsum_recall_stderr": 0.00317986279337076}}, "4": {"tldr_en": {"bleu": 0.6681174311441812, "bleu_stderr": 0.044021606714169687, "rouge1_fmeasure": 0.0587450878634487, "rouge1_fmeasure_stderr": 0.001987849748246455, "rouge1_precision": 0.05651284854563454, "rouge1_precision_stderr": 0.002164889280586853, "rouge1_recall": 0.08771427792659671, "rouge1_recall_stderr": 0.003046668032178251, "rouge2_fmeasure": 0.014698951085450463, "rouge2_fmeasure_stderr": 0.0007071025730658953, "rouge2_precision": 0.01373423070578025, "rouge2_precision_stderr": 0.0007944242550989159, "rouge2_recall": 0.023902876010557156, "rouge2_recall_stderr": 0.0012871698398313374, "rougeL_fmeasure": 0.04283521055248602, "rougeL_fmeasure_stderr": 0.001420656788711465, "rougeL_precision": 0.041455127649633716, "rougeL_precision_stderr": 0.001604158922614608, "rougeL_recall": 0.06565381938564946, "rougeL_recall_stderr": 0.002338953948684061, "rougeLsum_fmeasure": 0.05516099966060376, "rougeLsum_fmeasure_stderr": 0.0018676755144161564, "rougeLsum_precision": 0.053065857077570854, "rougeLsum_precision_stderr": 0.0020380922902812288, "rougeLsum_recall": 0.08260798722441874, "rougeLsum_recall_stderr": 0.002886226452230474}}, "5": {"tldr_en": {"bleu": 8.138411373037316e-07, "bleu_stderr": 1.7429365149014433e-06, "rouge1_fmeasure": 0.009157992753346606, "rouge1_fmeasure_stderr": 0.000856510665924713, "rouge1_precision": 0.008744552174562775, "rouge1_precision_stderr": 0.0008820534430831269, 
"rouge1_recall": 0.014063870074080958, "rouge1_recall_stderr": 0.0013626675606431993, "rouge2_fmeasure": 0.002073109168721178, "rouge2_fmeasure_stderr": 0.00025579423912383813, "rouge2_precision": 0.001867598511771763, "rouge2_precision_stderr": 0.0002589055645534681, "rouge2_recall": 0.0037526556188222397, "rouge2_recall_stderr": 0.0005665830894881638, "rougeL_fmeasure": 0.006597140236099842, "rougeL_fmeasure_stderr": 0.0006134117940431641, "rougeL_precision": 0.006278467313776165, "rougeL_precision_stderr": 0.0006318463479225579, "rougeL_recall": 0.010544788230721835, "rougeL_recall_stderr": 0.0010639298188298192, "rougeLsum_fmeasure": 0.008602211224573027, "rougeLsum_fmeasure_stderr": 0.0007971656716790751, "rougeLsum_precision": 0.00823467650006455, "rougeLsum_precision_stderr": 0.000829000231276819, "rougeLsum_recall": 0.013318449806788014, "rougeLsum_recall_stderr": 0.0012936214066513207}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.1675172437507659, "bleu_stderr": 0.027600643399440884, "rouge1_fmeasure": 0.024088126860004336, "rouge1_fmeasure_stderr": 0.0006566077823850723, "rouge1_precision": 0.028701737940994714, "rouge1_precision_stderr": 0.0010195754285503126, "rouge1_recall": 0.02982766113489123, "rouge1_recall_stderr": 0.0008471594561940423, "rouge2_fmeasure": 0.001114996015022125, "rouge2_fmeasure_stderr": 0.00018328608469909375, "rouge2_precision": 0.0015626210476206773, "rouge2_precision_stderr": 0.0003728911300811695, "rouge2_recall": 0.0014347430615192808, "rouge2_recall_stderr": 0.00023769294132393532, "rougeL_fmeasure": 0.023944632138488853, "rougeL_fmeasure_stderr": 0.0006463198681447033, "rougeL_precision": 0.028417386089142856, "rougeL_precision_stderr": 0.0009947648295870358, "rougeL_recall": 0.029685273969587405, "rougeL_recall_stderr": 0.00083480831932383, "rougeLsum_fmeasure": 0.022309991073370505, "rougeLsum_fmeasure_stderr": 0.0005760998972707722, "rougeLsum_precision": 0.02728685295948081, "rougeLsum_precision_stderr": 0.0009885875546693994, "rougeLsum_recall": 0.02727004908623676, "rougeLsum_recall_stderr": 0.0007114664570306672}}, "1": {"generate_text_restaurant": {"bleu": 5.8311861735166675, "bleu_stderr": 0.06388138684732117, "rouge1_fmeasure": 0.3211813837867464, "rouge1_fmeasure_stderr": 0.002018437643374035, "rouge1_precision": 0.2601081962246765, "rouge1_precision_stderr": 0.0021349255671127927, "rouge1_recall": 0.4686781469942304, "rouge1_recall_stderr": 0.002825480648369596, "rouge2_fmeasure": 0.13429472916574672, "rouge2_fmeasure_stderr": 0.0014217210935365725, "rouge2_precision": 0.10907314684339463, "rouge2_precision_stderr": 0.0013682678699487114, "rouge2_recall": 0.1992897827723483, "rouge2_recall_stderr": 0.0021146365213576476, "rougeL_fmeasure": 0.25612256110358006, "rougeL_fmeasure_stderr": 0.0014268806319662325, "rougeL_precision": 0.2056686183422757, "rougeL_precision_stderr": 0.0015071237696962308, "rougeL_recall": 0.37982651296992087, "rougeL_recall_stderr": 0.0023920704305803887, "rougeLsum_fmeasure": 0.2605843619987691, "rougeLsum_fmeasure_stderr": 0.0019094298729220246, "rougeLsum_precision": 0.2117404050410904, "rougeLsum_precision_stderr": 0.0019554275881996357, "rougeLsum_recall": 0.379262023761823, "rougeLsum_recall_stderr": 0.0026640542361163967}}, "2": {"generate_text_restaurant": {"bleu": 8.283032143741265, "bleu_stderr": 0.12324083863995262, "rouge1_fmeasure": 0.386855872645816, "rouge1_fmeasure_stderr": 0.0021481774383948936, "rouge1_precision": 0.3538185229938564, "rouge1_precision_stderr": 
0.0026093673205933284, "rouge1_recall": 0.47654923761453416, "rouge1_recall_stderr": 0.002742670643307486, "rouge2_fmeasure": 0.17540917102638537, "rouge2_fmeasure_stderr": 0.001676199795834029, "rouge2_precision": 0.16058763409752866, "rouge2_precision_stderr": 0.0017929892307175898, "rouge2_recall": 0.2186758841759895, "rouge2_recall_stderr": 0.0021644223522511653, "rougeL_fmeasure": 0.29032162387497273, "rougeL_fmeasure_stderr": 0.0016448424366105137, "rougeL_precision": 0.2634180709043917, "rougeL_precision_stderr": 0.0019363360461210728, "rougeL_recall": 0.3633286847976612, "rougeL_recall_stderr": 0.002418760103075258, "rougeLsum_fmeasure": 0.3215658810211159, "rougeLsum_fmeasure_stderr": 0.0020865232232060243, "rougeLsum_precision": 0.2943420649549215, "rougeLsum_precision_stderr": 0.0024049285177976626, "rougeLsum_recall": 0.396021431067016, "rougeLsum_recall_stderr": 0.0026539511959613863}}, "3": {"generate_text_restaurant": {"bleu": 10.390741038004455, "bleu_stderr": 0.16864807456665598, "rouge1_fmeasure": 0.4265219434574998, "rouge1_fmeasure_stderr": 0.00200022371619217, "rouge1_precision": 0.4170328570049967, "rouge1_precision_stderr": 0.002406804998864227, "rouge1_recall": 0.4755330496164165, "rouge1_recall_stderr": 0.0027356944570645627, "rouge2_fmeasure": 0.19794956049387477, "rouge2_fmeasure_stderr": 0.0017257612820525093, "rouge2_precision": 0.19311277752964032, "rouge2_precision_stderr": 0.0018435166685741753, "rouge2_recall": 0.2234558315509727, "rouge2_recall_stderr": 0.00217671447753381, "rougeL_fmeasure": 0.30937209897451035, "rougeL_fmeasure_stderr": 0.0017399874604572011, "rougeL_precision": 0.3018404766668879, "rougeL_precision_stderr": 0.0019954675660840008, "rougeL_recall": 0.34733784496925735, "rougeL_recall_stderr": 0.002410859115489849, "rougeLsum_fmeasure": 0.35577353404582346, "rougeLsum_fmeasure_stderr": 0.002018349673780584, "rougeLsum_precision": 0.34804273669533864, "rougeLsum_precision_stderr": 0.002320309559338597, "rougeLsum_recall": 0.39674483976023345, "rougeLsum_recall_stderr": 0.002636069670789193}}, "4": {"generate_text_restaurant": {"bleu": 11.329341641449828, "bleu_stderr": 0.17960715134758612, "rouge1_fmeasure": 0.4404396724578438, "rouge1_fmeasure_stderr": 0.0019501997658908016, "rouge1_precision": 0.43540537755676456, "rouge1_precision_stderr": 0.0023276114946809037, "rouge1_recall": 0.4809839727831288, "rouge1_recall_stderr": 0.0026909272047918853, "rouge2_fmeasure": 0.20632107444632167, "rouge2_fmeasure_stderr": 0.0017999776378623833, "rouge2_precision": 0.20335070986022913, "rouge2_precision_stderr": 0.0018630249723704795, "rouge2_recall": 0.2277953855378017, "rouge2_recall_stderr": 0.002231447665036813, "rougeL_fmeasure": 0.3146665867607543, "rougeL_fmeasure_stderr": 0.001793539263470212, "rougeL_precision": 0.31052804922692206, "rougeL_precision_stderr": 0.0019906703253583058, "rougeL_recall": 0.34538706806048836, "rougeL_recall_stderr": 0.0024265779869813795, "rougeLsum_fmeasure": 0.3681598682721843, "rougeLsum_fmeasure_stderr": 0.0020492701030304073, "rougeLsum_precision": 0.36360422785078816, "rougeLsum_precision_stderr": 0.002278768446357271, "rougeLsum_recall": 0.40267975309659465, "rougeLsum_recall_stderr": 0.002691014459855529}}, "5": {"generate_text_restaurant": {"bleu": 11.285255750718802, "bleu_stderr": 0.18888005349774178, "rouge1_fmeasure": 0.4415151553259178, "rouge1_fmeasure_stderr": 0.0018967405556148942, "rouge1_precision": 0.4365680288096621, "rouge1_precision_stderr": 0.0023563761935207865, "rouge1_recall": 
0.48375423799463574, "rouge1_recall_stderr": 0.002636920734911192, "rouge2_fmeasure": 0.20666903463506434, "rouge2_fmeasure_stderr": 0.0017287346978560685, "rouge2_precision": 0.2041613364300487, "rouge2_precision_stderr": 0.0018478653045982886, "rouge2_recall": 0.2286562929012743, "rouge2_recall_stderr": 0.0021444262626903934, "rougeL_fmeasure": 0.31404489497308175, "rougeL_fmeasure_stderr": 0.0017489740232863793, "rougeL_precision": 0.3097317177052037, "rougeL_precision_stderr": 0.0019875430217516188, "rougeL_recall": 0.3463247383671021, "rougeL_recall_stderr": 0.0024086631359451073, "rougeLsum_fmeasure": 0.3693839714597318, "rougeLsum_fmeasure_stderr": 0.001956221447002133, "rougeLsum_precision": 0.36511129451802127, "rougeLsum_precision_stderr": 0.002275854131726726, "rougeLsum_recall": 0.40524349161687195, "rougeLsum_recall_stderr": 0.002592210635732001}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9920111933130942, "bleu_stderr": 0.09083971543338792, "rouge1_fmeasure": 0.21238019045835924, "rouge1_fmeasure_stderr": 0.0026416842263647295, "rouge1_precision": 0.16661991011320432, "rouge1_precision_stderr": 0.0024239320503688508, "rouge1_recall": 0.3334279529769119, "rouge1_recall_stderr": 0.004505784466073162, "rouge2_fmeasure": 0.0474651150579007, "rouge2_fmeasure_stderr": 0.0016508808672473486, "rouge2_precision": 0.036458947908946444, "rouge2_precision_stderr": 0.0013971876206545527, "rouge2_recall": 0.07797973705513334, "rouge2_recall_stderr": 0.0027575216878305796, "rougeL_fmeasure": 0.1574172507962458, "rougeL_fmeasure_stderr": 0.0019726023490667156, "rougeL_precision": 0.12350643841301835, "rougeL_precision_stderr": 0.0018449748802517576, "rougeL_recall": 0.24834266376447475, "rougeL_recall_stderr": 0.0034694940835095387, "rougeLsum_fmeasure": 0.16564203891769072, "rougeLsum_fmeasure_stderr": 0.0021922015182424493, "rougeLsum_precision": 0.12930576998616114, "rougeLsum_precision_stderr": 0.001943629127681205, "rougeLsum_recall": 0.2628771358874318, "rougeLsum_recall_stderr": 0.003928661472441744}}, "1": {"article_DOC_summary": {"bleu": 1.3911048134042985, "bleu_stderr": 0.10897719788542379, "rouge1_fmeasure": 0.17238543114114188, "rouge1_fmeasure_stderr": 0.0024273582171495467, "rouge1_precision": 0.12220764132281824, "rouge1_precision_stderr": 0.0017962553489656822, "rouge1_recall": 0.3046016264780442, "rouge1_recall_stderr": 0.004195472170007382, "rouge2_fmeasure": 0.03433659588366439, "rouge2_fmeasure_stderr": 0.0013916698462749025, "rouge2_precision": 0.02404240814314951, "rouge2_precision_stderr": 0.0009715966898615973, "rouge2_recall": 0.06279570551371068, "rouge2_recall_stderr": 0.0026393784630938297, "rougeL_fmeasure": 0.13411490255282393, "rougeL_fmeasure_stderr": 0.001857806891825946, "rougeL_precision": 0.0948627092015581, "rougeL_precision_stderr": 0.0013564109394382858, "rougeL_recall": 0.23873232992980792, "rougeL_recall_stderr": 0.0033732063257411065, "rougeLsum_fmeasure": 0.1377971067717085, "rougeLsum_fmeasure_stderr": 0.0019961553438677753, "rougeLsum_precision": 0.09744320598123982, "rougeLsum_precision_stderr": 0.0014533168661689533, "rougeLsum_recall": 0.24526431304855986, "rougeLsum_recall_stderr": 0.0036135020193240157}}, "2": {"article_DOC_summary": {"bleu": 1.5131027243134945, "bleu_stderr": 0.09818604218520245, "rouge1_fmeasure": 0.1796260802555828, "rouge1_fmeasure_stderr": 0.0024295120173709354, "rouge1_precision": 0.12742301658282565, "rouge1_precision_stderr": 0.0018019437038001125, "rouge1_recall": 0.31650559447584714, 
"rouge1_recall_stderr": 0.0041632693268193, "rouge2_fmeasure": 0.03828129617404584, "rouge2_fmeasure_stderr": 0.0014588717099132717, "rouge2_precision": 0.026873267786760833, "rouge2_precision_stderr": 0.0010274689542246274, "rouge2_recall": 0.06937748072287012, "rouge2_recall_stderr": 0.002720274634345644, "rougeL_fmeasure": 0.14088593992593365, "rougeL_fmeasure_stderr": 0.0018545171064126879, "rougeL_precision": 0.09972086437810422, "rougeL_precision_stderr": 0.0013615528035081867, "rougeL_recall": 0.25006918430896163, "rougeL_recall_stderr": 0.0033224978911440477, "rougeLsum_fmeasure": 0.14453096950758154, "rougeLsum_fmeasure_stderr": 0.0020492721061134787, "rougeLsum_precision": 0.10228150708865971, "rougeLsum_precision_stderr": 0.001498292749384224, "rougeLsum_recall": 0.2564975734365291, "rougeLsum_recall_stderr": 0.003645659171226953}}, "3": {"article_DOC_summary": {"bleu": 1.5589813395856684, "bleu_stderr": 0.10558852439347298, "rouge1_fmeasure": 0.17393387340194993, "rouge1_fmeasure_stderr": 0.0026232973050223565, "rouge1_precision": 0.12560830490871291, "rouge1_precision_stderr": 0.0019805809749753123, "rouge1_recall": 0.30202505876515096, "rouge1_recall_stderr": 0.004562458827775563, "rouge2_fmeasure": 0.03719404523373464, "rouge2_fmeasure_stderr": 0.0014264985251086694, "rouge2_precision": 0.02632965637304347, "rouge2_precision_stderr": 0.0010168079119372074, "rouge2_recall": 0.06654262996214515, "rouge2_recall_stderr": 0.002628246685221607, "rougeL_fmeasure": 0.1367280753200056, "rougeL_fmeasure_stderr": 0.0020569662559889986, "rougeL_precision": 0.09872111817099473, "rougeL_precision_stderr": 0.0015538240912434773, "rougeL_recall": 0.23817398946563523, "rougeL_recall_stderr": 0.0036574265400714056, "rougeLsum_fmeasure": 0.13995698919988145, "rougeLsum_fmeasure_stderr": 0.0021562163365348155, "rougeLsum_precision": 0.10088738488275732, "rougeLsum_precision_stderr": 0.0016123154477431468, "rougeLsum_recall": 0.24479219612186495, "rougeLsum_recall_stderr": 0.0038858502767361853}}, "4": {"article_DOC_summary": {"bleu": 0.7800465095373856, "bleu_stderr": 0.110140635740535, "rouge1_fmeasure": 0.04714278099581509, "rouge1_fmeasure_stderr": 0.002671918175318051, "rouge1_precision": 0.039875289966317076, "rouge1_precision_stderr": 0.0024501045936788945, "rouge1_recall": 0.07430021940128062, "rouge1_recall_stderr": 0.004312735443842743, "rouge2_fmeasure": 0.009927038748098277, "rouge2_fmeasure_stderr": 0.0009411928436840111, "rouge2_precision": 0.00812289150475277, "rouge2_precision_stderr": 0.0009151347143203979, "rouge2_recall": 0.016372857345750558, "rouge2_recall_stderr": 0.0015796426861189893, "rougeL_fmeasure": 0.03725830963733278, "rougeL_fmeasure_stderr": 0.0020970987843492487, "rougeL_precision": 0.031900774626537044, "rougeL_precision_stderr": 0.0020006091032227793, "rougeL_recall": 0.058775527889088235, "rougeL_recall_stderr": 0.0033989669828951557, "rougeLsum_fmeasure": 0.03901707398161308, "rougeLsum_fmeasure_stderr": 0.0022151386053289395, "rougeLsum_precision": 0.03342763825361942, "rougeLsum_precision_stderr": 0.002105461480850086, "rougeLsum_recall": 0.06157534488712963, "rougeLsum_recall_stderr": 0.003610155719272957}}, "5": {"article_DOC_summary": {"bleu": 6.009065394749386e-37, "bleu_stderr": 1.386737757665829e-31, "rouge1_fmeasure": 0.0030037230280197226, "rouge1_fmeasure_stderr": 0.0008566665942648847, "rouge1_precision": 0.0033882933420727682, "rouge1_precision_stderr": 0.000994288954345408, "rouge1_recall": 0.002780937524962328, "rouge1_recall_stderr": 
0.0007843603020955971, "rouge2_fmeasure": 0.000983708450751863, "rouge2_fmeasure_stderr": 0.0004420715282406731, "rouge2_precision": 0.001120152102248019, "rouge2_precision_stderr": 0.0004966314099881747, "rouge2_recall": 0.0009065462839047746, "rouge2_recall_stderr": 0.0004161603469287444, "rougeL_fmeasure": 0.002337297608841716, "rougeL_fmeasure_stderr": 0.0007032854409325233, "rougeL_precision": 0.0026091624396139845, "rougeL_precision_stderr": 0.0007993813642794688, "rougeL_recall": 0.002187358356350872, "rougeL_recall_stderr": 0.0006548878815934947, "rougeLsum_fmeasure": 0.002540252346941857, "rougeLsum_fmeasure_stderr": 0.0007562653583340974, "rougeLsum_precision": 0.0028561523161821077, "rougeLsum_precision_stderr": 0.000870682583927747, "rougeLsum_recall": 0.002361092427287286, "rougeLsum_recall_stderr": 0.0006961516518425747}}}} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_0.csv b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..a7937387d068f2e26f46875dc0e6b88642dbb410 --- /dev/null +++ b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.014806864733738859,0 +anli_r2,acc,0.338,0.014965960710224482,0 +anli_r3,acc,0.3416666666666667,0.013696658778002519,0 +arc_challenge,acc,0.2841296928327645,0.013179442447653886,0 +arc_challenge,acc_norm,0.3003412969283277,0.013395909309957007,0 +arc_easy,acc,0.6031144781144782,0.010039236800583206,0 +arc_easy,acc_norm,0.5353535353535354,0.01023410454341143,0 +boolq,acc,0.5571865443425077,0.008687668766930827,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.22212270488132557,,1 +copa,acc,0.74,0.044084400227680794,0 +hellaswag,acc,0.48446524596693885,0.004987372476207027,0 +hellaswag,acc_norm,0.6316470822545309,0.004813719952829966,0 +piqa,acc,0.7627856365614799,0.009924694933586373,0 +piqa,acc_norm,0.7682263329706203,0.009845143772794052,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.858,0.011043457699378237,0 +sciq,acc_norm,0.766,0.01339490288966001,0 +storycloze_2016,acc,0.711918760021379,0.010472537019822575,0 +winogrande,acc,0.5951065509076559,0.01379592700312494,0 diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-01_0shots_backup.json b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-01_0shots_backup.json deleted file mode 100644 index b9289ba3c5da01629517bfcd7e976af53abfcbfd..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-01_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.014806864733738859 - }, - "anli_r2": { - "acc": 0.338, - "acc_stderr": 0.014965960710224482 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002519 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.22212270488132557 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.044084400227680794 - }, - "hellaswag": { - "acc": 0.48446524596693885, - "acc_stderr": 0.004987372476207027, - "acc_norm": 0.6316470822545309, - "acc_norm_stderr": 0.004813719952829966 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5951065509076559, - "acc_stderr": 
0.01379592700312494 - }, - "storycloze_2016": { - "acc": 0.711918760021379, - "acc_stderr": 0.010472537019822575 - }, - "boolq": { - "acc": 0.5571865443425077, - "acc_stderr": 0.008687668766930827 - }, - "arc_easy": { - "acc": 0.6031144781144782, - "acc_stderr": 0.010039236800583206, - "acc_norm": 0.5353535353535354, - "acc_norm_stderr": 0.01023410454341143 - }, - "arc_challenge": { - "acc": 0.2841296928327645, - "acc_stderr": 0.013179442447653886, - "acc_norm": 0.3003412969283277, - "acc_norm_stderr": 0.013395909309957007 - }, - "sciq": { - "acc": 0.858, - "acc_stderr": 0.011043457699378237, - "acc_norm": 0.766, - "acc_norm_stderr": 0.01339490288966001 - }, - "piqa": { - "acc": 0.7627856365614799, - "acc_stderr": 0.009924694933586373, - "acc_norm": 0.7682263329706203, - "acc_norm_stderr": 0.009845143772794052 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_1.csv b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..608272bc7af6500e0a4e853639f7a418b2088bf7 --- /dev/null +++ b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.315,0.014696631960792498,0 +anli_r2,acc,0.339,0.01497675877162034,0 +anli_r3,acc,0.3541666666666667,0.01381193349957096,0 +arc_challenge,acc,0.31143344709897613,0.013532472099850942,0 +arc_challenge,acc_norm,0.31313993174061433,0.013552671543623503,0 +arc_easy,acc,0.6186868686868687,0.009966542497171018,0 +arc_easy,acc_norm,0.571969696969697,0.010152943316426265,0 +boolq,acc,0.5718654434250765,0.008654253415781077,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.3333333333333333,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.47759410476000796,0.004984768912326931,0 +hellaswag,acc_norm,0.6308504282015535,0.004815882719278391,0 +piqa,acc,0.7568008705114254,0.010009611953858914,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267315,0 +rte,acc,0.5523465703971119,0.029931070362939533,0 +sciq,acc,0.878,0.010354864712936706,0 +sciq,acc_norm,0.852,0.01123486636423524,0 +storycloze_2016,acc,0.711918760021379,0.01047253701982257,0 +winogrande,acc,0.5785319652722968,0.013878072377497597,0 diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json deleted file mode 100644 index 1c3c3f205400d689e82a5b15c850ba6870fdde42..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.315, - "acc_stderr": 0.014696631960792498 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.01497675877162034 - }, - "anli_r3": { - "acc": 0.3541666666666667, - "acc_stderr": 0.01381193349957096 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.3333333333333333 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.47759410476000796, - "acc_stderr": 0.004984768912326931, - 
"acc_norm": 0.6308504282015535, - "acc_norm_stderr": 0.004815882719278391 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939533 - }, - "winogrande": { - "acc": 0.5785319652722968, - "acc_stderr": 0.013878072377497597 - }, - "storycloze_2016": { - "acc": 0.711918760021379, - "acc_stderr": 0.01047253701982257 - }, - "boolq": { - "acc": 0.5718654434250765, - "acc_stderr": 0.008654253415781077 - }, - "arc_easy": { - "acc": 0.6186868686868687, - "acc_stderr": 0.009966542497171018, - "acc_norm": 0.571969696969697, - "acc_norm_stderr": 0.010152943316426265 - }, - "arc_challenge": { - "acc": 0.31143344709897613, - "acc_stderr": 0.013532472099850942, - "acc_norm": 0.31313993174061433, - "acc_norm_stderr": 0.013552671543623503 - }, - "sciq": { - "acc": 0.878, - "acc_stderr": 0.010354864712936706, - "acc_norm": 0.852, - "acc_norm_stderr": 0.01123486636423524 - }, - "piqa": { - "acc": 0.7568008705114254, - "acc_stderr": 0.010009611953858914, - "acc_norm": 0.7589771490750816, - "acc_norm_stderr": 0.009979042717267315 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_2.csv b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..3370cabec6710527e3cb5e14a4beec8a494ed0ea --- /dev/null +++ b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.315,0.0146966319607925,0 +anli_r2,acc,0.326,0.01483050720454104,0 +anli_r3,acc,0.3466666666666667,0.013744022550571944,0 +arc_challenge,acc,0.2960750853242321,0.013340916085246254,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053059,0 +arc_easy,acc,0.6216329966329966,0.009951575683331949,0 +arc_easy,acc_norm,0.5921717171717171,0.010083950240041223,0 +boolq,acc,0.5715596330275229,0.008655028561519765,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.2606516290726817,,1 +copa,acc,0.74,0.044084400227680794,0 +hellaswag,acc,0.4737104162517427,0.004982879340691406,0 +hellaswag,acc_norm,0.6329416450906195,0.004810175357870948,0 +piqa,acc,0.7568008705114254,0.010009611953858917,0 +piqa,acc_norm,0.764961915125136,0.009893146688805306,0 +rte,acc,0.5415162454873647,0.029992535385373317,0 +sciq,acc,0.894,0.009739551265785133,0 +sciq,acc_norm,0.871,0.010605256784796577,0 +storycloze_2016,acc,0.7258150721539284,0.010316062787590001,0 +winogrande,acc,0.5966850828729282,0.013787257285896241,0 diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-02_2shots_backup.json b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-02_2shots_backup.json deleted file mode 100644 index 0abd296a9166c031990a450ebbc74ee6ef783a91..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-02_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.315, - "acc_stderr": 0.0146966319607925 - }, - "anli_r2": { - "acc": 0.326, - "acc_stderr": 0.01483050720454104 - }, - "anli_r3": { - "acc": 0.3466666666666667, - "acc_stderr": 0.013744022550571944 - }, - "cb": { - "acc": 0.39285714285714285, - 
"acc_stderr": 0.0658538889806635, - "f1": 0.2606516290726817 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.044084400227680794 - }, - "hellaswag": { - "acc": 0.4737104162517427, - "acc_stderr": 0.004982879340691406, - "acc_norm": 0.6329416450906195, - "acc_norm_stderr": 0.004810175357870948 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373317 - }, - "winogrande": { - "acc": 0.5966850828729282, - "acc_stderr": 0.013787257285896241 - }, - "storycloze_2016": { - "acc": 0.7258150721539284, - "acc_stderr": 0.010316062787590001 - }, - "boolq": { - "acc": 0.5715596330275229, - "acc_stderr": 0.008655028561519765 - }, - "arc_easy": { - "acc": 0.6216329966329966, - "acc_stderr": 0.009951575683331949, - "acc_norm": 0.5921717171717171, - "acc_norm_stderr": 0.010083950240041223 - }, - "arc_challenge": { - "acc": 0.2960750853242321, - "acc_stderr": 0.013340916085246254, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053059 - }, - "sciq": { - "acc": 0.894, - "acc_stderr": 0.009739551265785133, - "acc_norm": 0.871, - "acc_norm_stderr": 0.010605256784796577 - }, - "piqa": { - "acc": 0.7568008705114254, - "acc_stderr": 0.010009611953858917, - "acc_norm": 0.764961915125136, - "acc_norm_stderr": 0.009893146688805306 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_3.csv b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..f0771f1bf2ac4d5e7c20409dc80c39d920d1d93d --- /dev/null +++ b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.319,0.014746404865473472,0 +anli_r2,acc,0.345,0.015039986742055238,0 +anli_r3,acc,0.355,0.013819249004047303,0 +arc_challenge,acc,0.2883959044368601,0.013238394422428182,0 +arc_challenge,acc_norm,0.318259385665529,0.013611993916971453,0 +arc_easy,acc,0.6308922558922558,0.00990198741024273,0 +arc_easy,acc_norm,0.6026936026936027,0.010041053078884286,0 +boolq,acc,0.5865443425076453,0.008613059239942641,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.3181222134801993,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.47779326827325236,0.004984857671187105,0 +hellaswag,acc_norm,0.6338378809002191,0.004807699539973427,0 +piqa,acc,0.7584330794341676,0.009986718001804463,0 +piqa,acc_norm,0.764961915125136,0.009893146688805308,0 +rte,acc,0.5595667870036101,0.029882123363118723,0 +sciq,acc,0.903,0.009363689373248114,0 +sciq,acc_norm,0.893,0.009779910359847165,0 +storycloze_2016,acc,0.7231427044361304,0.010347112890276929,0 +winogrande,acc,0.5651144435674822,0.013932814110418024,0 diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json deleted file mode 100644 index 374a97008dca04c73fff269d6dee93ad6e7e20a0..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.319, - "acc_stderr": 
0.014746404865473472 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055238 - }, - "anli_r3": { - "acc": 0.355, - "acc_stderr": 0.013819249004047303 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.3181222134801993 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.47779326827325236, - "acc_stderr": 0.004984857671187105, - "acc_norm": 0.6338378809002191, - "acc_norm_stderr": 0.004807699539973427 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118723 - }, - "winogrande": { - "acc": 0.5651144435674822, - "acc_stderr": 0.013932814110418024 - }, - "storycloze_2016": { - "acc": 0.7231427044361304, - "acc_stderr": 0.010347112890276929 - }, - "boolq": { - "acc": 0.5865443425076453, - "acc_stderr": 0.008613059239942641 - }, - "arc_easy": { - "acc": 0.6308922558922558, - "acc_stderr": 0.00990198741024273, - "acc_norm": 0.6026936026936027, - "acc_norm_stderr": 0.010041053078884286 - }, - "arc_challenge": { - "acc": 0.2883959044368601, - "acc_stderr": 0.013238394422428182, - "acc_norm": 0.318259385665529, - "acc_norm_stderr": 0.013611993916971453 - }, - "sciq": { - "acc": 0.903, - "acc_stderr": 0.009363689373248114, - "acc_norm": 0.893, - "acc_norm_stderr": 0.009779910359847165 - }, - "piqa": { - "acc": 0.7584330794341676, - "acc_stderr": 0.009986718001804463, - "acc_norm": 0.764961915125136, - "acc_norm_stderr": 0.009893146688805308 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_4.csv b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..076d91b4345cffcd8c68c2038b64b8919e0b7edf --- /dev/null +++ b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.332,0.014899597242811492,0 +anli_r2,acc,0.368,0.0152580735615218,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.2960750853242321,0.013340916085246263,0 +arc_challenge,acc_norm,0.3250853242320819,0.013688147309729124,0 +arc_easy,acc,0.6304713804713805,0.009904325878447319,0 +arc_easy,acc_norm,0.6085858585858586,0.010014917532627819,0 +boolq,acc,0.5691131498470948,0.008661108320775374,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.33712121212121215,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4765982871937861,0.004984313205791442,0 +hellaswag,acc_norm,0.6384186417048396,0.0047947648436852865,0 +piqa,acc,0.7589771490750816,0.009979042717267315,0 +piqa,acc_norm,0.764417845484222,0.009901067586473885,0 +rte,acc,0.47653429602888087,0.03006330041190266,0 +sciq,acc,0.908,0.009144376393151108,0 +sciq,acc_norm,0.9,0.009491579957525044,0 +storycloze_2016,acc,0.7300908605024051,0.01026541350322146,0 +winogrande,acc,0.580110497237569,0.013870943986310395,0 diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-02_4shots_backup.json b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-02_4shots_backup.json deleted file mode 100644 index d552572c4d050c4470d865e408b396f760319ef4..0000000000000000000000000000000000000000 --- 
a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-02_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.332, - "acc_stderr": 0.014899597242811492 - }, - "anli_r2": { - "acc": 0.368, - "acc_stderr": 0.0152580735615218 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077247 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.33712121212121215 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4765982871937861, - "acc_stderr": 0.004984313205791442, - "acc_norm": 0.6384186417048396, - "acc_norm_stderr": 0.0047947648436852865 - }, - "rte": { - "acc": 0.47653429602888087, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.580110497237569, - "acc_stderr": 0.013870943986310395 - }, - "storycloze_2016": { - "acc": 0.7300908605024051, - "acc_stderr": 0.01026541350322146 - }, - "boolq": { - "acc": 0.5691131498470948, - "acc_stderr": 0.008661108320775374 - }, - "arc_easy": { - "acc": 0.6304713804713805, - "acc_stderr": 0.009904325878447319, - "acc_norm": 0.6085858585858586, - "acc_norm_stderr": 0.010014917532627819 - }, - "arc_challenge": { - "acc": 0.2960750853242321, - "acc_stderr": 0.013340916085246263, - "acc_norm": 0.3250853242320819, - "acc_norm_stderr": 0.013688147309729124 - }, - "sciq": { - "acc": 0.908, - "acc_stderr": 0.009144376393151108, - "acc_norm": 0.9, - "acc_norm_stderr": 0.009491579957525044 - }, - "piqa": { - "acc": 0.7589771490750816, - "acc_stderr": 0.009979042717267315, - "acc_norm": 0.764417845484222, - "acc_norm_stderr": 0.009901067586473885 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_5.csv b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..6ce4ed9079251313da3b69dfac28e32cab788025 --- /dev/null +++ b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.343,0.015019206922356951,0 +anli_r2,acc,0.332,0.014899597242811478,0 +anli_r3,acc,0.33166666666666667,0.01359683672948518,0 +arc_challenge,acc,0.3046075085324232,0.013449522109932487,0 +arc_challenge,acc_norm,0.33361774744027306,0.01377868705417654,0 +arc_easy,acc,0.6308922558922558,0.009901987410242733,0 +arc_easy,acc_norm,0.6077441077441077,0.010018744689650043,0 +boolq,acc,0.5896024464831804,0.008603488048617521,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.22512077294685992,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.477096195976897,0.00498454354093234,0 +hellaswag,acc_norm,0.6394144592710616,0.004791890625834213,0 +piqa,acc,0.7584330794341676,0.009986718001804456,0 +piqa,acc_norm,0.766050054406964,0.009877236895137434,0 +rte,acc,0.5306859205776173,0.030039730592197812,0 +sciq,acc,0.916,0.008776162089491132,0 +sciq,acc_norm,0.909,0.009099549538400243,0 +storycloze_2016,acc,0.7247461250668092,0.010328538400500572,0 +winogrande,acc,0.590370955011839,0.013821049109655472,0 diff --git a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json 
b/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json deleted file mode 100644 index 61f2e179efa9aee0aba89444f6db0358b079e55c..0000000000000000000000000000000000000000 --- a/4b284b21bc4seed4/evaluation/rankeval/4b284b21bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.343, - "acc_stderr": 0.015019206922356951 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811478 - }, - "anli_r3": { - "acc": 0.33166666666666667, - "acc_stderr": 0.01359683672948518 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.22512077294685992 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.477096195976897, - "acc_stderr": 0.00498454354093234, - "acc_norm": 0.6394144592710616, - "acc_norm_stderr": 0.004791890625834213 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197812 - }, - "winogrande": { - "acc": 0.590370955011839, - "acc_stderr": 0.013821049109655472 - }, - "storycloze_2016": { - "acc": 0.7247461250668092, - "acc_stderr": 0.010328538400500572 - }, - "boolq": { - "acc": 0.5896024464831804, - "acc_stderr": 0.008603488048617521 - }, - "arc_easy": { - "acc": 0.6308922558922558, - "acc_stderr": 0.009901987410242733, - "acc_norm": 0.6077441077441077, - "acc_norm_stderr": 0.010018744689650043 - }, - "arc_challenge": { - "acc": 0.3046075085324232, - "acc_stderr": 0.013449522109932487, - "acc_norm": 0.33361774744027306, - "acc_norm_stderr": 0.01377868705417654 - }, - "sciq": { - "acc": 0.916, - "acc_stderr": 0.008776162089491132, - "acc_norm": 0.909, - "acc_norm_stderr": 0.009099549538400243 - }, - "piqa": { - "acc": 0.7584330794341676, - "acc_stderr": 0.009986718001804456, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.009877236895137434 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/generation/merged.csv b/4b284b28bc4seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d4445d8afb3bf07f86c2123753b03d286bb32d6d --- /dev/null +++ b/4b284b28bc4seed1/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0024384647317544604 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0024384647317544604 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2090655371625227 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2090655371625227 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23283198535704958 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23283198535704958 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24214353888062579 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24214353888062579 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.2391776138295886 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.2391776138295886 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24094008936533884 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24094008936533884 +e2e_nlg_cleaned,5,average,multiple,0.19443287155448 
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.043870150801107066 +gem_xsum,0,median,rouge2_fmeasure,0.043870150801107066 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04075132217381377 +gem_xsum,1,median,rouge2_fmeasure,0.04075132217381377 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.038998510414389495 +gem_xsum,2,median,rouge2_fmeasure,0.038998510414389495 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03728687717351484 +gem_xsum,3,median,rouge2_fmeasure,0.03728687717351484 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009890919687627312 +gem_xsum,4,median,rouge2_fmeasure,0.009890919687627312 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0002457908682882255 +gem_xsum,5,median,rouge2_fmeasure,0.0002457908682882255 +gem_xsum,5,average,multiple,0.02850726185312345 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05551128171851163 +web_nlg_en,0,median,rouge2_fmeasure,0.05551128171851163 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.061270467324873235 +web_nlg_en,1,median,rouge2_fmeasure,0.061270467324873235 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.06151955178238632 +web_nlg_en,2,median,rouge2_fmeasure,0.06151955178238632 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05835931677377586 +web_nlg_en,3,median,rouge2_fmeasure,0.05835931677377586 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05863215249774496 +web_nlg_en,4,median,rouge2_fmeasure,0.05863215249774496 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.058674755789965206 +web_nlg_en,5,median,rouge2_fmeasure,0.058674755789965206 +web_nlg_en,5,average,multiple,0.0589945876478762 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.033403065447518296 +wiki_lingua_en,0,median,rouge2_fmeasure,0.033403065447518296 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.057255731342826154 +wiki_lingua_en,1,median,rouge2_fmeasure,0.057255731342826154 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.054139172220906376 +wiki_lingua_en,2,median,rouge2_fmeasure,0.054139172220906376 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04512193425646708 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04512193425646708 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.012956061411719485 +wiki_lingua_en,4,median,rouge2_fmeasure,0.012956061411719485 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0022561594203614308 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0022561594203614308 +wiki_lingua_en,5,average,multiple,0.03418868734996647 diff --git a/4b284b28bc4seed1/evaluation/generation/merged.json b/4b284b28bc4seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..99e8003c4bf505428d547956a976bfb92e9434d4 --- /dev/null +++ b/4b284b28bc4seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.37735446471481876, "bleu_stderr": 0.03627756616654335, "rouge1_fmeasure": 0.11660803455743125, "rouge1_fmeasure_stderr": 0.002092415000785059, "rouge1_precision": 0.0770951414018354, "rouge1_precision_stderr": 0.0016045271434282637, "rouge1_recall": 0.32071402180605507, "rouge1_recall_stderr": 0.004605078296811788, "rouge2_fmeasure": 0.05551128171851163, "rouge2_fmeasure_stderr": 0.0013139706655432171, "rouge2_precision": 0.036415277664524216, "rouge2_precision_stderr": 0.0009758440561531726, "rouge2_recall": 0.15772225366490272, "rouge2_recall_stderr": 0.0032080616090033745, "rougeL_fmeasure": 0.11225387153032584, "rougeL_fmeasure_stderr": 0.0019226335441299288, "rougeL_precision": 0.07386003315833899, "rougeL_precision_stderr": 0.0014481551839227544, "rougeL_recall": 0.3123826157382853, 
"rougeL_recall_stderr": 0.0045001843968478835, "rougeLsum_fmeasure": 0.11169551049560707, "rougeLsum_fmeasure_stderr": 0.0019597969064310676, "rougeLsum_precision": 0.07376812234473101, "rougeLsum_precision_stderr": 0.0014993347968782623, "rougeLsum_recall": 0.3075886721444383, "rougeLsum_recall_stderr": 0.004330300847827791}}, "1": {"PALM_prompt": {"bleu": 0.5955492347128004, "bleu_stderr": 0.04098243772631065, "rouge1_fmeasure": 0.12834095324411207, "rouge1_fmeasure_stderr": 0.0019248518716807731, "rouge1_precision": 0.08263899702993936, "rouge1_precision_stderr": 0.001483174937834192, "rouge1_recall": 0.4023666815632534, "rouge1_recall_stderr": 0.005245336574571597, "rouge2_fmeasure": 0.061270467324873235, "rouge2_fmeasure_stderr": 0.0012492449538634826, "rouge2_precision": 0.039217284731154656, "rouge2_precision_stderr": 0.0009170392495425271, "rouge2_recall": 0.20543752587672504, "rouge2_recall_stderr": 0.003913508994823848, "rougeL_fmeasure": 0.12158319023743391, "rougeL_fmeasure_stderr": 0.0017288009789448422, "rougeL_precision": 0.07806911458526311, "rougeL_precision_stderr": 0.001322862559894183, "rougeL_recall": 0.3835240834246368, "rougeL_recall_stderr": 0.004954856771984596, "rougeLsum_fmeasure": 0.12124370085382484, "rougeLsum_fmeasure_stderr": 0.0017887287009066164, "rougeLsum_precision": 0.07810015746385479, "rougeLsum_precision_stderr": 0.0013877161994542311, "rougeLsum_recall": 0.38024068911899445, "rougeLsum_recall_stderr": 0.004835861426173133}}, "2": {"PALM_prompt": {"bleu": 0.6719567149622531, "bleu_stderr": 0.04451490063633614, "rouge1_fmeasure": 0.12936180136586783, "rouge1_fmeasure_stderr": 0.001954303430193513, "rouge1_precision": 0.08290888658614914, "rouge1_precision_stderr": 0.0015350215758182237, "rouge1_recall": 0.4194244346322516, "rouge1_recall_stderr": 0.005283499761780351, "rouge2_fmeasure": 0.06151955178238632, "rouge2_fmeasure_stderr": 0.0012577473248259622, "rouge2_precision": 0.03927899079960598, "rouge2_precision_stderr": 0.0009493902598473397, "rouge2_recall": 0.214648184705714, "rouge2_recall_stderr": 0.004017055136123084, "rougeL_fmeasure": 0.12064349526703928, "rougeL_fmeasure_stderr": 0.001720033258665151, "rougeL_precision": 0.07707538779187502, "rougeL_precision_stderr": 0.0013161012683531753, "rougeL_recall": 0.3915395709801734, "rougeL_recall_stderr": 0.0048204418807369295, "rougeLsum_fmeasure": 0.12216673770750221, "rougeLsum_fmeasure_stderr": 0.001800956219634565, "rougeLsum_precision": 0.07820442311124796, "rougeLsum_precision_stderr": 0.0013841071483193569, "rougeLsum_recall": 0.3953219549146526, "rougeLsum_recall_stderr": 0.004864683263901412}}, "3": {"PALM_prompt": {"bleu": 0.7030429320938599, "bleu_stderr": 0.04228909607379892, "rouge1_fmeasure": 0.12494961329088779, "rouge1_fmeasure_stderr": 0.001871934588598472, "rouge1_precision": 0.07917967419538707, "rouge1_precision_stderr": 0.0014240165968682841, "rouge1_recall": 0.42024267058618486, "rouge1_recall_stderr": 0.005232177743876596, "rouge2_fmeasure": 0.05835931677377586, "rouge2_fmeasure_stderr": 0.001172617966432863, "rouge2_precision": 0.03673571584796494, "rouge2_precision_stderr": 0.0008485900852921835, "rouge2_recall": 0.21159361761560835, "rouge2_recall_stderr": 0.003896974137970697, "rougeL_fmeasure": 0.11519873729255874, "rougeL_fmeasure_stderr": 0.0016320470107472416, "rougeL_precision": 0.07289709850715816, "rougeL_precision_stderr": 0.001220535361848458, "rougeL_recall": 0.3865171662156251, "rougeL_recall_stderr": 0.004668856621947524, "rougeLsum_fmeasure": 
0.11782366546024012, "rougeLsum_fmeasure_stderr": 0.0017256698791084089, "rougeLsum_precision": 0.07468731684101934, "rougeLsum_precision_stderr": 0.0013010266610264402, "rougeLsum_recall": 0.3950249369083203, "rougeLsum_recall_stderr": 0.0047835320890464855}}, "4": {"PALM_prompt": {"bleu": 0.7009084921005813, "bleu_stderr": 0.05429101270641276, "rouge1_fmeasure": 0.12409393489139078, "rouge1_fmeasure_stderr": 0.001785975484122767, "rouge1_precision": 0.0787656137813602, "rouge1_precision_stderr": 0.0014303284095470072, "rouge1_recall": 0.4198316942830002, "rouge1_recall_stderr": 0.0051407465648353945, "rouge2_fmeasure": 0.05863215249774496, "rouge2_fmeasure_stderr": 0.0011389709231231604, "rouge2_precision": 0.03711436336217505, "rouge2_precision_stderr": 0.0009399506148593571, "rouge2_recall": 0.21491818852062866, "rouge2_recall_stderr": 0.003973307675575708, "rougeL_fmeasure": 0.11439156159112489, "rougeL_fmeasure_stderr": 0.0015688033893795825, "rougeL_precision": 0.07260277037447828, "rougeL_precision_stderr": 0.0012668906951770727, "rougeL_recall": 0.38573795956761925, "rougeL_recall_stderr": 0.004591295339002268, "rougeLsum_fmeasure": 0.11724728813607507, "rougeLsum_fmeasure_stderr": 0.001670529322751021, "rougeLsum_precision": 0.0744525796262277, "rougeLsum_precision_stderr": 0.0013359002532122982, "rougeLsum_recall": 0.3952770689440709, "rougeLsum_recall_stderr": 0.004713626458568202}}, "5": {"PALM_prompt": {"bleu": 0.7813605370790605, "bleu_stderr": 0.044236747351497484, "rouge1_fmeasure": 0.12489900535257513, "rouge1_fmeasure_stderr": 0.001892834896809631, "rouge1_precision": 0.07910701465761572, "rouge1_precision_stderr": 0.0014814464979107673, "rouge1_recall": 0.42965787368823066, "rouge1_recall_stderr": 0.005259601751094787, "rouge2_fmeasure": 0.058674755789965206, "rouge2_fmeasure_stderr": 0.0012025616483889573, "rouge2_precision": 0.03705119454588921, "rouge2_precision_stderr": 0.0009271466855999936, "rouge2_recall": 0.2180047518637017, "rouge2_recall_stderr": 0.00400770350410333, "rougeL_fmeasure": 0.11391088630556961, "rougeL_fmeasure_stderr": 0.0016293445333007116, "rougeL_precision": 0.07219818322662526, "rougeL_precision_stderr": 0.0012976343736866773, "rougeL_recall": 0.3913465741327545, "rougeL_recall_stderr": 0.004618454191060434, "rougeLsum_fmeasure": 0.11797028786316605, "rougeLsum_fmeasure_stderr": 0.0017753435966714796, "rougeLsum_precision": 0.07484317706922312, "rougeLsum_precision_stderr": 0.0014014602911312686, "rougeLsum_recall": 0.4051096507534578, "rougeLsum_recall_stderr": 0.004862305392822499}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4207573221891459, "bleu_stderr": 0.060668680742149504, "rouge1_fmeasure": 0.17278267579995407, "rouge1_fmeasure_stderr": 0.001805309989869822, "rouge1_precision": 0.1472006912589573, "rouge1_precision_stderr": 0.0018081532683583343, "rouge1_recall": 0.25280412679250724, "rouge1_recall_stderr": 0.0027283406577109795, "rouge2_fmeasure": 0.033403065447518296, "rouge2_fmeasure_stderr": 0.0008091708062643894, "rouge2_precision": 0.02800882087488719, "rouge2_precision_stderr": 0.0007031555613566064, "rouge2_recall": 0.05161999521321001, "rouge2_recall_stderr": 0.0014471575437312986, "rougeL_fmeasure": 0.13668346985187405, "rougeL_fmeasure_stderr": 0.0013073897309746026, "rougeL_precision": 0.1151686636057282, "rougeL_precision_stderr": 0.0012754431121468031, "rougeL_recall": 0.20439066404636233, "rougeL_recall_stderr": 0.002231774421791313, "rougeLsum_fmeasure": 0.15765023763708086, "rougeLsum_fmeasure_stderr": 
0.001625669978151679, "rougeLsum_precision": 0.1342468034278481, "rougeLsum_precision_stderr": 0.0016375746774704983, "rougeLsum_recall": 0.2313321529160814, "rougeLsum_recall_stderr": 0.002494421485097423}}, "1": {"tldr_en": {"bleu": 2.863520294428325, "bleu_stderr": 0.04821407211996815, "rouge1_fmeasure": 0.22635289032885908, "rouge1_fmeasure_stderr": 0.0019797732821072913, "rouge1_precision": 0.19794818451477075, "rouge1_precision_stderr": 0.002216007848664949, "rouge1_recall": 0.3284489945851762, "rouge1_recall_stderr": 0.002997251491577378, "rouge2_fmeasure": 0.057255731342826154, "rouge2_fmeasure_stderr": 0.0010651766443270621, "rouge2_precision": 0.05003816092030399, "rouge2_precision_stderr": 0.0010440179296527974, "rouge2_recall": 0.0863451197200493, "rouge2_recall_stderr": 0.0018333871572600053, "rougeL_fmeasure": 0.1592439241741607, "rougeL_fmeasure_stderr": 0.0013041393201605942, "rougeL_precision": 0.1385898377468809, "rougeL_precision_stderr": 0.0015132970995651801, "rougeL_recall": 0.23662979084247882, "rougeL_recall_stderr": 0.002337708767980629, "rougeLsum_fmeasure": 0.21134027415429807, "rougeLsum_fmeasure_stderr": 0.0018497593685481146, "rougeLsum_precision": 0.18480829497363066, "rougeLsum_precision_stderr": 0.0020842494081539653, "rougeLsum_recall": 0.30748871703201347, "rougeLsum_recall_stderr": 0.0028452982805044153}}, "2": {"tldr_en": {"bleu": 2.892049675027145, "bleu_stderr": 0.05098830000976219, "rouge1_fmeasure": 0.21621899679871145, "rouge1_fmeasure_stderr": 0.0018861785936639156, "rouge1_precision": 0.19406216075430627, "rouge1_precision_stderr": 0.0022851568065445846, "rouge1_recall": 0.30801257358803485, "rouge1_recall_stderr": 0.002815672688199465, "rouge2_fmeasure": 0.054139172220906376, "rouge2_fmeasure_stderr": 0.0010468476711488718, "rouge2_precision": 0.04969349334752031, "rouge2_precision_stderr": 0.0012211667072253483, "rouge2_recall": 0.07912027885387732, "rouge2_recall_stderr": 0.001712170031568902, "rougeL_fmeasure": 0.15514555559600926, "rougeL_fmeasure_stderr": 0.0013211824425543813, "rougeL_precision": 0.1390654173595737, "rougeL_precision_stderr": 0.0016872675834548336, "rougeL_recall": 0.22587848754502876, "rougeL_recall_stderr": 0.002267886826339719, "rougeLsum_fmeasure": 0.2033687466390169, "rougeLsum_fmeasure_stderr": 0.0017670650217538845, "rougeLsum_precision": 0.18238320998068505, "rougeLsum_precision_stderr": 0.0021483718861069925, "rougeLsum_recall": 0.2906352040127194, "rougeLsum_recall_stderr": 0.0026915444202120025}}, "3": {"tldr_en": {"bleu": 2.9015609655711327, "bleu_stderr": 0.0731100382224477, "rouge1_fmeasure": 0.18012746746417807, "rouge1_fmeasure_stderr": 0.002211658008220921, "rouge1_precision": 0.17111604893495974, "rouge1_precision_stderr": 0.002666234365501737, "rouge1_recall": 0.25241562149299535, "rouge1_recall_stderr": 0.0032749227911704902, "rouge2_fmeasure": 0.04512193425646708, "rouge2_fmeasure_stderr": 0.0010254911232422685, "rouge2_precision": 0.04327701615115013, "rouge2_precision_stderr": 0.001228777572136881, "rouge2_recall": 0.06549662087884625, "rouge2_recall_stderr": 0.0016995103815543647, "rougeL_fmeasure": 0.13030697479391384, "rougeL_fmeasure_stderr": 0.0015820227940213536, "rougeL_precision": 0.12471745040277263, "rougeL_precision_stderr": 0.0020684991761244493, "rougeL_recall": 0.18654805673867839, "rougeL_recall_stderr": 0.002583812297767984, "rougeLsum_fmeasure": 0.1697095188842209, "rougeLsum_fmeasure_stderr": 0.0020776929681688918, "rougeLsum_precision": 0.16128332553163546, 
"rougeLsum_precision_stderr": 0.0025338005143025694, "rougeLsum_recall": 0.23839128203771087, "rougeLsum_recall_stderr": 0.003113680952409177}}, "4": {"tldr_en": {"bleu": 0.5498632715838676, "bleu_stderr": 0.04532412143006689, "rouge1_fmeasure": 0.0547919221475693, "rouge1_fmeasure_stderr": 0.0018695427841614045, "rouge1_precision": 0.054570512553222275, "rouge1_precision_stderr": 0.0021295344255454545, "rouge1_recall": 0.0795788085275046, "rouge1_recall_stderr": 0.002819526024958868, "rouge2_fmeasure": 0.012956061411719485, "rouge2_fmeasure_stderr": 0.000654310189300764, "rouge2_precision": 0.012963816979253771, "rouge2_precision_stderr": 0.0008570339066074451, "rouge2_recall": 0.020089181351346234, "rouge2_recall_stderr": 0.0011359068136854778, "rougeL_fmeasure": 0.041080312014145574, "rougeL_fmeasure_stderr": 0.001390509184087734, "rougeL_precision": 0.04143340245616873, "rougeL_precision_stderr": 0.001670845773969632, "rougeL_recall": 0.060916796276303795, "rougeL_recall_stderr": 0.0022168298240763207, "rougeLsum_fmeasure": 0.05125820536012792, "rougeLsum_fmeasure_stderr": 0.0017489799459199928, "rougeLsum_precision": 0.05115883129564417, "rougeLsum_precision_stderr": 0.0020129461932352306, "rougeLsum_recall": 0.07453932183957888, "rougeLsum_recall_stderr": 0.0026507114407709974}}, "5": {"tldr_en": {"bleu": 7.095551523492589e-07, "bleu_stderr": 1.483647929837512e-06, "rouge1_fmeasure": 0.009074928433158342, "rouge1_fmeasure_stderr": 0.0008689330426263544, "rouge1_precision": 0.009254019379187583, "rouge1_precision_stderr": 0.0009687953589273818, "rouge1_recall": 0.012906851032002356, "rouge1_recall_stderr": 0.0012462005287005372, "rouge2_fmeasure": 0.0022561594203614308, "rouge2_fmeasure_stderr": 0.00031382246712575974, "rouge2_precision": 0.0024187488668729984, "rouge2_precision_stderr": 0.00044284833188613796, "rouge2_recall": 0.0031816545395102693, "rouge2_recall_stderr": 0.0004382892858640001, "rougeL_fmeasure": 0.006803875534069048, "rougeL_fmeasure_stderr": 0.0006705094104727261, "rougeL_precision": 0.0070282827330193105, "rougeL_precision_stderr": 0.0007783399245473895, "rougeL_recall": 0.009802719504705336, "rougeL_recall_stderr": 0.0009814452084083668, "rougeLsum_fmeasure": 0.00846353027326687, "rougeLsum_fmeasure_stderr": 0.0008128006829974675, "rougeLsum_precision": 0.008596891531763053, "rougeLsum_precision_stderr": 0.0009017346586956089, "rougeLsum_recall": 0.012142210310286464, "rougeLsum_recall_stderr": 0.0011816152654071754}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.4098344694476387, "bleu_stderr": 0.05659734906094685, "rouge1_fmeasure": 0.048385938607711065, "rouge1_fmeasure_stderr": 0.0009554921914241868, "rouge1_precision": 0.08028081330149647, "rouge1_precision_stderr": 0.0015384112652978448, "rouge1_recall": 0.04939996548808655, "rouge1_recall_stderr": 0.0013089989862407679, "rouge2_fmeasure": 0.0024384647317544604, "rouge2_fmeasure_stderr": 0.00023457941137137737, "rouge2_precision": 0.0027541218407848736, "rouge2_precision_stderr": 0.00035092238584102253, "rouge2_recall": 0.0033678792789505574, "rouge2_recall_stderr": 0.00032239765676952145, "rougeL_fmeasure": 0.0477602327181495, "rougeL_fmeasure_stderr": 0.0009369815165680327, "rougeL_precision": 0.07933411355241447, "rougeL_precision_stderr": 0.0015031308017294464, "rougeL_recall": 0.04857484893118309, "rougeL_recall_stderr": 0.0012730023314328899, "rougeLsum_fmeasure": 0.046214076424090866, "rougeLsum_fmeasure_stderr": 0.0008828068716652509, "rougeLsum_precision": 0.07839335233527917, 
"rougeLsum_precision_stderr": 0.0014996927491176875, "rougeLsum_recall": 0.04630917456025576, "rougeLsum_recall_stderr": 0.0011796155371411761}}, "1": {"generate_text_restaurant": {"bleu": 11.566579816189526, "bleu_stderr": 0.1008934461871328, "rouge1_fmeasure": 0.44235732853210524, "rouge1_fmeasure_stderr": 0.0023941703294020514, "rouge1_precision": 0.5476584258558421, "rouge1_precision_stderr": 0.003456480736532219, "rouge1_recall": 0.41369520299915363, "rouge1_recall_stderr": 0.0030436852499156355, "rouge2_fmeasure": 0.2090655371625227, "rouge2_fmeasure_stderr": 0.0019982452977907856, "rouge2_precision": 0.26480819367826736, "rouge2_precision_stderr": 0.002815017462568761, "rouge2_recall": 0.1947916755785088, "rouge2_recall_stderr": 0.0021369452000412747, "rougeL_fmeasure": 0.3244139780407101, "rougeL_fmeasure_stderr": 0.0020695436330019803, "rougeL_precision": 0.40566221817688664, "rougeL_precision_stderr": 0.003117710931571736, "rougeL_recall": 0.30207439523525703, "rougeL_recall_stderr": 0.0024361075306840673, "rougeLsum_fmeasure": 0.3633576021387155, "rougeLsum_fmeasure_stderr": 0.0022990935361642674, "rougeLsum_precision": 0.4515596659057806, "rougeLsum_precision_stderr": 0.00330045757816753, "rougeLsum_recall": 0.3391748876129398, "rougeLsum_recall_stderr": 0.002736309734827984}}, "2": {"generate_text_restaurant": {"bleu": 13.388773838263758, "bleu_stderr": 0.15855873725678352, "rouge1_fmeasure": 0.4668734222692764, "rouge1_fmeasure_stderr": 0.002246846357740528, "rouge1_precision": 0.5789198401113218, "rouge1_precision_stderr": 0.0034646581707259965, "rouge1_recall": 0.43386579914021006, "rouge1_recall_stderr": 0.0028978275527303445, "rouge2_fmeasure": 0.23283198535704958, "rouge2_fmeasure_stderr": 0.0020561178954031285, "rouge2_precision": 0.29695825419975175, "rouge2_precision_stderr": 0.003050205383079884, "rouge2_recall": 0.21537221020252503, "rouge2_recall_stderr": 0.002178321767423481, "rougeL_fmeasure": 0.35447078432499324, "rougeL_fmeasure_stderr": 0.0021030174285698792, "rougeL_precision": 0.4435501207130385, "rougeL_precision_stderr": 0.0033174896772021436, "rougeL_recall": 0.3283330300619082, "rougeL_recall_stderr": 0.0024660245741253578, "rougeLsum_fmeasure": 0.39454340194994386, "rougeLsum_fmeasure_stderr": 0.0023023116088308807, "rougeLsum_precision": 0.490510155822524, "rougeLsum_precision_stderr": 0.003436994721582442, "rougeLsum_recall": 0.3662609564143789, "rougeLsum_recall_stderr": 0.0027327897367931055}}, "3": {"generate_text_restaurant": {"bleu": 14.332946272100804, "bleu_stderr": 0.18137970117023247, "rouge1_fmeasure": 0.47410403700383463, "rouge1_fmeasure_stderr": 0.0022235093842193814, "rouge1_precision": 0.5793041287934014, "rouge1_precision_stderr": 0.003372992411703443, "rouge1_recall": 0.4438630375906513, "rouge1_recall_stderr": 0.0029246450295944823, "rouge2_fmeasure": 0.24214353888062579, "rouge2_fmeasure_stderr": 0.0020914337667352996, "rouge2_precision": 0.30259452883798293, "rouge2_precision_stderr": 0.002973225867205113, "rouge2_recall": 0.2262470005601965, "rouge2_recall_stderr": 0.002281252913319087, "rougeL_fmeasure": 0.3610728360004429, "rougeL_fmeasure_stderr": 0.0021477160139884146, "rougeL_precision": 0.4446517195178283, "rougeL_precision_stderr": 0.0032644622543037123, "rougeL_recall": 0.33699099215579725, "rougeL_recall_stderr": 0.0025380393382758446, "rougeLsum_fmeasure": 0.4027299671398636, "rougeLsum_fmeasure_stderr": 0.002308027196519871, "rougeLsum_precision": 0.49295247754853494, "rougeLsum_precision_stderr": 0.003364958172666197, 
"rougeLsum_recall": 0.376746301427234, "rougeLsum_recall_stderr": 0.0027787439904039756}}, "4": {"generate_text_restaurant": {"bleu": 14.060665357528471, "bleu_stderr": 0.26445377512594503, "rouge1_fmeasure": 0.4710196471035029, "rouge1_fmeasure_stderr": 0.00223956317877308, "rouge1_precision": 0.5812217022092131, "rouge1_precision_stderr": 0.0034309298270467957, "rouge1_recall": 0.43567025831059786, "rouge1_recall_stderr": 0.002843591723717016, "rouge2_fmeasure": 0.2391776138295886, "rouge2_fmeasure_stderr": 0.002129468559557926, "rouge2_precision": 0.3024182514661336, "rouge2_precision_stderr": 0.003059613249842079, "rouge2_recall": 0.22039519212546937, "rouge2_recall_stderr": 0.0022450567626817208, "rougeL_fmeasure": 0.3592882674955681, "rougeL_fmeasure_stderr": 0.002128255611143205, "rougeL_precision": 0.44652148397509384, "rougeL_precision_stderr": 0.003277961495936333, "rougeL_recall": 0.33135080522129373, "rougeL_recall_stderr": 0.0024514490127927014, "rougeLsum_fmeasure": 0.4001617252953104, "rougeLsum_fmeasure_stderr": 0.0023226135429815936, "rougeLsum_precision": 0.4943861067221433, "rougeLsum_precision_stderr": 0.003406003991655421, "rougeLsum_recall": 0.3698962329844, "rougeLsum_recall_stderr": 0.002720553601988623}}, "5": {"generate_text_restaurant": {"bleu": 14.308593309054016, "bleu_stderr": 0.14306952311664156, "rouge1_fmeasure": 0.47488846126671874, "rouge1_fmeasure_stderr": 0.0021786011409350068, "rouge1_precision": 0.5775681396669916, "rouge1_precision_stderr": 0.003331795685464776, "rouge1_recall": 0.4402215693743334, "rouge1_recall_stderr": 0.0027531550513425758, "rouge2_fmeasure": 0.24094008936533884, "rouge2_fmeasure_stderr": 0.0020801259022813117, "rouge2_precision": 0.29979953949402255, "rouge2_precision_stderr": 0.0029503967662262848, "rouge2_recall": 0.2224041099843757, "rouge2_recall_stderr": 0.0021904880519093752, "rougeL_fmeasure": 0.36154908902057886, "rougeL_fmeasure_stderr": 0.0021160968101490625, "rougeL_precision": 0.44250221759721664, "rougeL_precision_stderr": 0.0031886562916719433, "rougeL_recall": 0.33425438519938394, "rougeL_recall_stderr": 0.002422039927744084, "rougeLsum_fmeasure": 0.40460308229003333, "rougeLsum_fmeasure_stderr": 0.002297073510569085, "rougeLsum_precision": 0.49255025128439295, "rougeLsum_precision_stderr": 0.003325987333005444, "rougeLsum_recall": 0.3748555131665994, "rougeLsum_recall_stderr": 0.002667744873458217}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9479708786806795, "bleu_stderr": 0.08014965365678729, "rouge1_fmeasure": 0.2087853046997779, "rouge1_fmeasure_stderr": 0.0025328837093340183, "rouge1_precision": 0.1613424323587368, "rouge1_precision_stderr": 0.002278376950302783, "rouge1_recall": 0.3330533490420518, "rouge1_recall_stderr": 0.004383443983456259, "rouge2_fmeasure": 0.043870150801107066, "rouge2_fmeasure_stderr": 0.0015541821908238338, "rouge2_precision": 0.03314422437427033, "rouge2_precision_stderr": 0.0012522698778844542, "rouge2_recall": 0.07302060035369815, "rouge2_recall_stderr": 0.0026623802488437657, "rougeL_fmeasure": 0.1562890353716468, "rougeL_fmeasure_stderr": 0.0019283406025933057, "rougeL_precision": 0.12067382175182433, "rougeL_precision_stderr": 0.0017342073690786556, "rougeL_recall": 0.25037736095962637, "rougeL_recall_stderr": 0.0034248848896776688, "rougeLsum_fmeasure": 0.16248318414135096, "rougeLsum_fmeasure_stderr": 0.0021310634283090322, "rougeLsum_precision": 0.12496312472870656, "rougeLsum_precision_stderr": 0.001832956489326573, "rougeLsum_recall": 0.2617586907885203, 
"rougeLsum_recall_stderr": 0.0038506762926574915}}, "1": {"article_DOC_summary": {"bleu": 1.6036382509204854, "bleu_stderr": 0.07096687163435135, "rouge1_fmeasure": 0.1865319266668177, "rouge1_fmeasure_stderr": 0.002524634111297582, "rouge1_precision": 0.13272782271590566, "rouge1_precision_stderr": 0.0018764384818534341, "rouge1_recall": 0.3267515350090506, "rouge1_recall_stderr": 0.004353020039146078, "rouge2_fmeasure": 0.04075132217381377, "rouge2_fmeasure_stderr": 0.0014397841016716487, "rouge2_precision": 0.0286660681091914, "rouge2_precision_stderr": 0.001013349141668393, "rouge2_recall": 0.07362289198488865, "rouge2_recall_stderr": 0.002707772359921043, "rougeL_fmeasure": 0.14475331879083578, "rougeL_fmeasure_stderr": 0.0018814005333497661, "rougeL_precision": 0.10272547144552927, "rougeL_precision_stderr": 0.0013800002526758255, "rougeL_recall": 0.25572197637104055, "rougeL_recall_stderr": 0.003442070323824128, "rougeLsum_fmeasure": 0.14751402177418876, "rougeLsum_fmeasure_stderr": 0.0020984330928484275, "rougeLsum_precision": 0.10462377645973141, "rougeLsum_precision_stderr": 0.0015282810270458372, "rougeLsum_recall": 0.260731647450524, "rougeLsum_recall_stderr": 0.0038051114474756866}}, "2": {"article_DOC_summary": {"bleu": 1.5160558945216587, "bleu_stderr": 0.08240737955012244, "rouge1_fmeasure": 0.18378435325514345, "rouge1_fmeasure_stderr": 0.002535460776839407, "rouge1_precision": 0.13076052871501112, "rouge1_precision_stderr": 0.0018792476759405042, "rouge1_recall": 0.3221325006706253, "rouge1_recall_stderr": 0.004397231507997294, "rouge2_fmeasure": 0.038998510414389495, "rouge2_fmeasure_stderr": 0.0014233539161123735, "rouge2_precision": 0.02746291848253061, "rouge2_precision_stderr": 0.0010036620622712227, "rouge2_recall": 0.07033120823579617, "rouge2_recall_stderr": 0.002679121665145769, "rougeL_fmeasure": 0.14281263909344286, "rougeL_fmeasure_stderr": 0.0018586972576843518, "rougeL_precision": 0.10140981704363383, "rougeL_precision_stderr": 0.0013659104358610527, "rougeL_recall": 0.2519770493702287, "rougeL_recall_stderr": 0.003389522634128558, "rougeLsum_fmeasure": 0.14511393337009704, "rougeLsum_fmeasure_stderr": 0.0020497175410351316, "rougeLsum_precision": 0.10292716745077025, "rougeLsum_precision_stderr": 0.001493393265112054, "rougeLsum_recall": 0.2564707137279286, "rougeLsum_recall_stderr": 0.0037227133454889715}}, "3": {"article_DOC_summary": {"bleu": 1.4838319001573719, "bleu_stderr": 0.096950687868892, "rouge1_fmeasure": 0.1755472401383986, "rouge1_fmeasure_stderr": 0.0026658088779740008, "rouge1_precision": 0.12752846466949222, "rouge1_precision_stderr": 0.002107108659509441, "rouge1_recall": 0.30226873054413106, "rouge1_recall_stderr": 0.004603263404573647, "rouge2_fmeasure": 0.03728687717351484, "rouge2_fmeasure_stderr": 0.0014073859223946697, "rouge2_precision": 0.02656552305156673, "rouge2_precision_stderr": 0.0010143773775889025, "rouge2_recall": 0.06679540752654423, "rouge2_recall_stderr": 0.0026398777724888113, "rougeL_fmeasure": 0.1374413682741159, "rougeL_fmeasure_stderr": 0.0020025230497704035, "rougeL_precision": 0.09973634913453447, "rougeL_precision_stderr": 0.0016065186547029968, "rougeL_recall": 0.2381694226551552, "rougeL_recall_stderr": 0.003581000725397854, "rougeLsum_fmeasure": 0.13972451850744902, "rougeLsum_fmeasure_stderr": 0.002189941237196646, "rougeLsum_precision": 0.10128532281127449, "rougeLsum_precision_stderr": 0.0017234587079133783, "rougeLsum_recall": 0.242387646401674, "rougeLsum_recall_stderr": 0.003912550948546174}}, "4": 
{"article_DOC_summary": {"bleu": 0.7569692825353468, "bleu_stderr": 0.08589568488878548, "rouge1_fmeasure": 0.04778183046984643, "rouge1_fmeasure_stderr": 0.0026876876154109395, "rouge1_precision": 0.04055839800057727, "rouge1_precision_stderr": 0.002477310590755891, "rouge1_recall": 0.075289351379979, "rouge1_recall_stderr": 0.004369683499123559, "rouge2_fmeasure": 0.009890919687627312, "rouge2_fmeasure_stderr": 0.0009266580590961874, "rouge2_precision": 0.007387682998651727, "rouge2_precision_stderr": 0.0006990296012429621, "rouge2_recall": 0.01658715369278413, "rouge2_recall_stderr": 0.0015969348183973628, "rougeL_fmeasure": 0.037129528917459374, "rougeL_fmeasure_stderr": 0.002067815641725132, "rougeL_precision": 0.03201249080708822, "rougeL_precision_stderr": 0.0020286497636226534, "rougeL_recall": 0.058963105965077874, "rougeL_recall_stderr": 0.0034421636677800026, "rougeLsum_fmeasure": 0.03818253053785788, "rougeLsum_fmeasure_stderr": 0.0021629296440312594, "rougeLsum_precision": 0.032870357769899584, "rougeLsum_precision_stderr": 0.0020908192203781383, "rougeLsum_recall": 0.060477341143397076, "rougeLsum_recall_stderr": 0.0035760110748676577}}, "5": {"article_DOC_summary": {"bleu": 4.750583931400068e-39, "bleu_stderr": 8.063749129795418e-34, "rouge1_fmeasure": 0.002428042108949549, "rouge1_fmeasure_stderr": 0.0006836828342851362, "rouge1_precision": 0.0027310061588494907, "rouge1_precision_stderr": 0.0007639869380554712, "rouge1_recall": 0.002278390688529166, "rouge1_recall_stderr": 0.000653161458483614, "rouge2_fmeasure": 0.0002457908682882255, "rouge2_fmeasure_stderr": 0.00013419496972284081, "rouge2_precision": 0.0002712711508493972, "rouge2_precision_stderr": 0.00014357866607635477, "rouge2_recall": 0.00022836178496555851, "rouge2_recall_stderr": 0.00012848868152370965, "rougeL_fmeasure": 0.0017821424924565805, "rougeL_fmeasure_stderr": 0.0005116719884636547, "rougeL_precision": 0.0019627628749802942, "rougeL_precision_stderr": 0.0005482087057123049, "rougeL_recall": 0.0017131282019251252, "rougeL_recall_stderr": 0.0005120016166906651, "rougeLsum_fmeasure": 0.0020181296932120095, "rougeLsum_fmeasure_stderr": 0.0005652424749866139, "rougeLsum_precision": 0.0022448445249864724, "rougeLsum_precision_stderr": 0.0006188549077092995, "rougeLsum_recall": 0.001918332943289463, "rougeLsum_recall_stderr": 0.0005536248619504823}}}} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_0.csv b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..552cb62288a3bf167c37a7f2df161e5255f696f3 --- /dev/null +++ b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.014830507204541037,0 +anli_r2,acc,0.332,0.014899597242811476,0 +anli_r3,acc,0.35333333333333333,0.013804572162314932,0 +arc_challenge,acc,0.2713310580204778,0.012993807727545794,0 +arc_challenge,acc_norm,0.30716723549488056,0.013481034054980943,0 +arc_easy,acc,0.5862794612794613,0.010105878530238132,0 +arc_easy,acc_norm,0.5168350168350169,0.010253966261288895,0 +boolq,acc,0.6058103975535168,0.008546995661233635,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.1920045045045045,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.4814777932682733,0.004986356526063966,0 +hellaswag,acc_norm,0.6333399721171081,0.004809077205343496,0 +piqa,acc,0.7421109902067464,0.01020695666205625,0 +piqa,acc_norm,0.7578890097932536,0.009994371269104397,0 
+rte,acc,0.5631768953068592,0.029855247390314945,0 +sciq,acc,0.846,0.011419913065098704,0 +sciq,acc_norm,0.766,0.01339490288966001,0 +storycloze_2016,acc,0.7220737573490112,0.010359403651225856,0 +winogrande,acc,0.5919494869771112,0.013812822643745027,0 diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json deleted file mode 100644 index 75343732033842b1fea79f981f5e728a7711897c..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.014830507204541037 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811476 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314932 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.1920045045045045 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.4814777932682733, - "acc_stderr": 0.004986356526063966, - "acc_norm": 0.6333399721171081, - "acc_norm_stderr": 0.004809077205343496 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.029855247390314945 - }, - "winogrande": { - "acc": 0.5919494869771112, - "acc_stderr": 0.013812822643745027 - }, - "storycloze_2016": { - "acc": 0.7220737573490112, - "acc_stderr": 0.010359403651225856 - }, - "boolq": { - "acc": 0.6058103975535168, - "acc_stderr": 0.008546995661233635 - }, - "arc_easy": { - "acc": 0.5862794612794613, - "acc_stderr": 0.010105878530238132, - "acc_norm": 0.5168350168350169, - "acc_norm_stderr": 0.010253966261288895 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.012993807727545794, - "acc_norm": 0.30716723549488056, - "acc_norm_stderr": 0.013481034054980943 - }, - "sciq": { - "acc": 0.846, - "acc_stderr": 0.011419913065098704, - "acc_norm": 0.766, - "acc_norm_stderr": 0.01339490288966001 - }, - "piqa": { - "acc": 0.7421109902067464, - "acc_stderr": 0.01020695666205625, - "acc_norm": 0.7578890097932536, - "acc_norm_stderr": 0.009994371269104397 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_1.csv b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..be87806be28245ac85d07ef68347fc3cd8401e7d --- /dev/null +++ b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928367,0 +anli_r2,acc,0.346,0.01505026612756444,0 +anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.2815699658703072,0.013143376735009031,0 +arc_challenge,acc_norm,0.30887372013651876,0.013501770929344,0 +arc_easy,acc,0.6136363636363636,0.00999129677815962,0 +arc_easy,acc_norm,0.5765993265993266,0.01013867100528905,0 +boolq,acc,0.5957186544342508,0.008583313811372065,1 +cb,acc,0.48214285714285715,0.06737697508644648,1 +cb,f1,0.33543417366946776,,1 +copa,acc,0.78,0.04163331998932262,0 
+hellaswag,acc,0.47699661422027484,0.004984497871025248,0 +hellaswag,acc_norm,0.6310495917147978,0.004815343349305197,0 +piqa,acc,0.7470076169749728,0.01014288869886246,0 +piqa,acc_norm,0.7551686615886833,0.010032309105568802,0 +rte,acc,0.5306859205776173,0.030039730592197816,0 +sciq,acc,0.881,0.010244215145336662,0 +sciq,acc_norm,0.859,0.011010914595992448,0 +storycloze_2016,acc,0.711918760021379,0.010472537019822583,0 +winogrande,acc,0.5927387529597474,0.013808654122417855,0 diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json deleted file mode 100644 index 9c204fb86901a4f1348481c91fb089d5edd307e5..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928367 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.01505026612756444 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.06737697508644648, - "f1": 0.33543417366946776 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932262 - }, - "hellaswag": { - "acc": 0.47699661422027484, - "acc_stderr": 0.004984497871025248, - "acc_norm": 0.6310495917147978, - "acc_norm_stderr": 0.004815343349305197 - }, - "rte": { - "acc": 0.5306859205776173, - "acc_stderr": 0.030039730592197816 - }, - "winogrande": { - "acc": 0.5927387529597474, - "acc_stderr": 0.013808654122417855 - }, - "storycloze_2016": { - "acc": 0.711918760021379, - "acc_stderr": 0.010472537019822583 - }, - "boolq": { - "acc": 0.5957186544342508, - "acc_stderr": 0.008583313811372065 - }, - "arc_easy": { - "acc": 0.6136363636363636, - "acc_stderr": 0.00999129677815962, - "acc_norm": 0.5765993265993266, - "acc_norm_stderr": 0.01013867100528905 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.013143376735009031, - "acc_norm": 0.30887372013651876, - "acc_norm_stderr": 0.013501770929344 - }, - "sciq": { - "acc": 0.881, - "acc_stderr": 0.010244215145336662, - "acc_norm": 0.859, - "acc_norm_stderr": 0.011010914595992448 - }, - "piqa": { - "acc": 0.7470076169749728, - "acc_stderr": 0.01014288869886246, - "acc_norm": 0.7551686615886833, - "acc_norm_stderr": 0.010032309105568802 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_2.csv b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..51958dccbd7a5120ea1daa23b7e3480f862bf445 --- /dev/null +++ b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.336,0.014944140233795023,0 +anli_r2,acc,0.339,0.014976758771620347,0 +anli_r3,acc,0.30916666666666665,0.013346684134591957,0 +arc_challenge,acc,0.28754266211604096,0.01322671905626613,0 +arc_challenge,acc_norm,0.31569965870307165,0.013582571095815291,0 +arc_easy,acc,0.6224747474747475,0.009947227833469432,0 
+arc_easy,acc_norm,0.601010101010101,0.010048240683798743,0 +boolq,acc,0.6131498470948012,0.008518188340844743,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.3117283950617284,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.47759410476000796,0.004984768912326932,0 +hellaswag,acc_norm,0.6331408086038638,0.0048096267236268425,0 +piqa,acc,0.750816104461371,0.010091882770120214,0 +piqa,acc_norm,0.7600652883569097,0.009963625892809545,0 +rte,acc,0.5451263537906137,0.029973636495415255,0 +sciq,acc,0.895,0.009698921026024964,0 +sciq,acc_norm,0.881,0.010244215145336664,0 +storycloze_2016,acc,0.7199358631747729,0.01038376499392048,0 +winogrande,acc,0.5951065509076559,0.013795927003124939,0 diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_2_lm-eval_global_step80108_2023-02-15-11-04-01_2shots_backup.json b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_2_lm-eval_global_step80108_2023-02-15-11-04-01_2shots_backup.json deleted file mode 100644 index a0fe6746923f9316620b2f84affae1bde8b0c3b4..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_2_lm-eval_global_step80108_2023-02-15-11-04-01_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.336, - "acc_stderr": 0.014944140233795023 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.014976758771620347 - }, - "anli_r3": { - "acc": 0.30916666666666665, - "acc_stderr": 0.013346684134591957 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.3117283950617284 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.47759410476000796, - "acc_stderr": 0.004984768912326932, - "acc_norm": 0.6331408086038638, - "acc_norm_stderr": 0.0048096267236268425 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415255 - }, - "winogrande": { - "acc": 0.5951065509076559, - "acc_stderr": 0.013795927003124939 - }, - "storycloze_2016": { - "acc": 0.7199358631747729, - "acc_stderr": 0.01038376499392048 - }, - "boolq": { - "acc": 0.6131498470948012, - "acc_stderr": 0.008518188340844743 - }, - "arc_easy": { - "acc": 0.6224747474747475, - "acc_stderr": 0.009947227833469432, - "acc_norm": 0.601010101010101, - "acc_norm_stderr": 0.010048240683798743 - }, - "arc_challenge": { - "acc": 0.28754266211604096, - "acc_stderr": 0.01322671905626613, - "acc_norm": 0.31569965870307165, - "acc_norm_stderr": 0.013582571095815291 - }, - "sciq": { - "acc": 0.895, - "acc_stderr": 0.009698921026024964, - "acc_norm": 0.881, - "acc_norm_stderr": 0.010244215145336664 - }, - "piqa": { - "acc": 0.750816104461371, - "acc_stderr": 0.010091882770120214, - "acc_norm": 0.7600652883569097, - "acc_norm_stderr": 0.009963625892809545 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_3.csv b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..3e8cd0022309a5b16e63603c1bfa956aa9c0ca06 --- /dev/null +++ b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.014888272588203933,0 +anli_r2,acc,0.313,0.014671272822977888,0 
+anli_r3,acc,0.3433333333333333,0.01371263383046586,0 +arc_challenge,acc,0.2841296928327645,0.013179442447653887,0 +arc_challenge,acc_norm,0.3174061433447099,0.01360223908803817,0 +arc_easy,acc,0.6212121212121212,0.009953737656542037,0 +arc_easy,acc_norm,0.6153198653198653,0.009983171707008999,0 +boolq,acc,0.617125382262997,0.008501734385335954,1 +cb,acc,0.35714285714285715,0.06460957383809218,1 +cb,f1,0.26080246913580246,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.47769368651663013,0.004984813391016206,0 +hellaswag,acc_norm,0.6324437363075085,0.004811543077792729,0 +piqa,acc,0.7464635473340587,0.010150090834551786,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267314,0 +rte,acc,0.5703971119133574,0.02979666882912467,0 +sciq,acc,0.905,0.009276910103103306,0 +sciq,acc_norm,0.893,0.009779910359847167,0 +storycloze_2016,acc,0.7231427044361304,0.01034711289027693,0 +winogrande,acc,0.5777426992896606,0.013881582030658543,0 diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json deleted file mode 100644 index 042d813d0855c313e4c26b99fb9e12f0bedf62eb..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.014888272588203933 - }, - "anli_r2": { - "acc": 0.313, - "acc_stderr": 0.014671272822977888 - }, - "anli_r3": { - "acc": 0.3433333333333333, - "acc_stderr": 0.01371263383046586 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809218, - "f1": 0.26080246913580246 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.47769368651663013, - "acc_stderr": 0.004984813391016206, - "acc_norm": 0.6324437363075085, - "acc_norm_stderr": 0.004811543077792729 - }, - "rte": { - "acc": 0.5703971119133574, - "acc_stderr": 0.02979666882912467 - }, - "winogrande": { - "acc": 0.5777426992896606, - "acc_stderr": 0.013881582030658543 - }, - "storycloze_2016": { - "acc": 0.7231427044361304, - "acc_stderr": 0.01034711289027693 - }, - "boolq": { - "acc": 0.617125382262997, - "acc_stderr": 0.008501734385335954 - }, - "arc_easy": { - "acc": 0.6212121212121212, - "acc_stderr": 0.009953737656542037, - "acc_norm": 0.6153198653198653, - "acc_norm_stderr": 0.009983171707008999 - }, - "arc_challenge": { - "acc": 0.2841296928327645, - "acc_stderr": 0.013179442447653887, - "acc_norm": 0.3174061433447099, - "acc_norm_stderr": 0.01360223908803817 - }, - "sciq": { - "acc": 0.905, - "acc_stderr": 0.009276910103103306, - "acc_norm": 0.893, - "acc_norm_stderr": 0.009779910359847167 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551786, - "acc_norm": 0.7589771490750816, - "acc_norm_stderr": 0.009979042717267314 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_4.csv b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_4.csv new file mode 100644 index 
0000000000000000000000000000000000000000..afa29b8113648c4772a5f0f1661f733669f95a5f --- /dev/null +++ b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.339,0.014976758771620335,0 +anli_r2,acc,0.342,0.01500870618212173,0 +anli_r3,acc,0.3333333333333333,0.0136139500102256,0 +arc_challenge,acc,0.29692832764505117,0.013352025976725222,0 +arc_challenge,acc_norm,0.33276450511945393,0.013769863046192312,0 +arc_easy,acc,0.6279461279461279,0.009918187193096466,0 +arc_easy,acc_norm,0.6132154882154882,0.009993308355370965,0 +boolq,acc,0.6250764525993884,0.008467017704333002,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.35582970488630866,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4758016331408086,0.004983934343250461,0 +hellaswag,acc_norm,0.6377215694084843,0.004796763521045232,0 +piqa,acc,0.7464635473340587,0.010150090834551791,0 +piqa,acc_norm,0.7584330794341676,0.009986718001804456,0 +rte,acc,0.5342960288808665,0.030025579819366422,0 +sciq,acc,0.901,0.009449248027662744,0 +sciq,acc_norm,0.892,0.009820001651345705,0 +storycloze_2016,acc,0.7226082308925709,0.010353267472010775,0 +winogrande,acc,0.5911602209944752,0.013816954295135688,0 diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_4_lm-eval_global_step80108_2023-02-15-11-04-02_4shots_backup.json b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_4_lm-eval_global_step80108_2023-02-15-11-04-02_4shots_backup.json deleted file mode 100644 index 1cdb7f5459c0d00d99a81b214f70d0d272293da1..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_4_lm-eval_global_step80108_2023-02-15-11-04-02_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.339, - "acc_stderr": 0.014976758771620335 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.01500870618212173 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.0136139500102256 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.35582970488630866 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.4758016331408086, - "acc_stderr": 0.004983934343250461, - "acc_norm": 0.6377215694084843, - "acc_norm_stderr": 0.004796763521045232 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366422 - }, - "winogrande": { - "acc": 0.5911602209944752, - "acc_stderr": 0.013816954295135688 - }, - "storycloze_2016": { - "acc": 0.7226082308925709, - "acc_stderr": 0.010353267472010775 - }, - "boolq": { - "acc": 0.6250764525993884, - "acc_stderr": 0.008467017704333002 - }, - "arc_easy": { - "acc": 0.6279461279461279, - "acc_stderr": 0.009918187193096466, - "acc_norm": 0.6132154882154882, - "acc_norm_stderr": 0.009993308355370965 - }, - "arc_challenge": { - "acc": 0.29692832764505117, - "acc_stderr": 0.013352025976725222, - "acc_norm": 0.33276450511945393, - "acc_norm_stderr": 0.013769863046192312 - }, - "sciq": { - "acc": 0.901, - "acc_stderr": 0.009449248027662744, - "acc_norm": 0.892, - "acc_norm_stderr": 0.009820001651345705 - }, - "piqa": { - "acc": 0.7464635473340587, - "acc_stderr": 0.010150090834551791, - "acc_norm": 0.7584330794341676, - "acc_norm_stderr": 0.009986718001804456 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } 
-} \ No newline at end of file diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_5.csv b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..58bc784526e35046dd673a24eee27b0e9cd87607 --- /dev/null +++ b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.346,0.015050266127564441,0 +anli_r2,acc,0.336,0.014944140233795027,0 +anli_r3,acc,0.32166666666666666,0.01349009528298952,0 +arc_challenge,acc,0.29948805460750855,0.01338502163731357,0 +arc_challenge,acc_norm,0.3216723549488055,0.013650488084494164,0 +arc_easy,acc,0.6338383838383839,0.009885391390947724,0 +arc_easy,acc_norm,0.6199494949494949,0.009960175831493126,0 +boolq,acc,0.6238532110091743,0.008472516562330721,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.2872985170857511,,1 +copa,acc,0.81,0.03942772444036623,0 +hellaswag,acc,0.476000796654053,0.0049840302505072915,0 +hellaswag,acc_norm,0.6396136227843059,0.004791313101877042,0 +piqa,acc,0.7480957562568009,0.010128421335088683,0 +piqa,acc_norm,0.7568008705114254,0.010009611953858917,0 +rte,acc,0.51985559566787,0.030072723167317177,0 +sciq,acc,0.909,0.009099549538400236,0 +sciq,acc_norm,0.901,0.009449248027662734,0 +storycloze_2016,acc,0.7268840192410476,0.010303512765124681,0 +winogrande,acc,0.5887924230465666,0.013829128358676862,0 diff --git a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json b/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json deleted file mode 100644 index 8716e16bc79e9fff0c05fdec9a73939f46f9fbdd..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed1/evaluation/rankeval/4b284b28bc4seed1_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.346, - "acc_stderr": 0.015050266127564441 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795027 - }, - "anli_r3": { - "acc": 0.32166666666666666, - "acc_stderr": 0.01349009528298952 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.2872985170857511 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.476000796654053, - "acc_stderr": 0.0049840302505072915, - "acc_norm": 0.6396136227843059, - "acc_norm_stderr": 0.004791313101877042 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317177 - }, - "winogrande": { - "acc": 0.5887924230465666, - "acc_stderr": 0.013829128358676862 - }, - "storycloze_2016": { - "acc": 0.7268840192410476, - "acc_stderr": 0.010303512765124681 - }, - "boolq": { - "acc": 0.6238532110091743, - "acc_stderr": 0.008472516562330721 - }, - "arc_easy": { - "acc": 0.6338383838383839, - "acc_stderr": 0.009885391390947724, - "acc_norm": 0.6199494949494949, - "acc_norm_stderr": 0.009960175831493126 - }, - "arc_challenge": { - "acc": 0.29948805460750855, - "acc_stderr": 0.01338502163731357, - "acc_norm": 0.3216723549488055, - "acc_norm_stderr": 0.013650488084494164 - }, - "sciq": { - "acc": 0.909, - "acc_stderr": 0.009099549538400236, - "acc_norm": 0.901, - "acc_norm_stderr": 0.009449248027662734 - }, - "piqa": { - "acc": 0.7480957562568009, - "acc_stderr": 0.010128421335088683, - "acc_norm": 0.7568008705114254, - "acc_norm_stderr": 0.010009611953858917 - } - }, - "versions": { - 
"anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/generation/merged.csv b/4b284b28bc4seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..a09f86a03f414a8461d07347f71a224754b8c825 --- /dev/null +++ b/4b284b28bc4seed2/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00993468622248673 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00993468622248673 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.19587297305804474 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.19587297305804474 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.20007291305638683 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.20007291305638683 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.200583189798169 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.200583189798169 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.202879451815592 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.202879451815592 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.20083246520570572 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.20083246520570572 +e2e_nlg_cleaned,5,average,multiple,0.16836261319273083 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.055096743612893836 +gem_xsum,0,median,rouge2_fmeasure,0.055096743612893836 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03416151854233213 +gem_xsum,1,median,rouge2_fmeasure,0.03416151854233213 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03548103421040685 +gem_xsum,2,median,rouge2_fmeasure,0.03548103421040685 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03519128429327673 +gem_xsum,3,median,rouge2_fmeasure,0.03519128429327673 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.008749872613988179 +gem_xsum,4,median,rouge2_fmeasure,0.008749872613988179 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005852654722501501 +gem_xsum,5,median,rouge2_fmeasure,0.0005852654722501501 +gem_xsum,5,average,multiple,0.028210953124191314 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05087917177498001 +web_nlg_en,0,median,rouge2_fmeasure,0.05087917177498001 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05213699889610434 +web_nlg_en,1,median,rouge2_fmeasure,0.05213699889610434 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05321206929632585 +web_nlg_en,2,median,rouge2_fmeasure,0.05321206929632585 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.052818017804867626 +web_nlg_en,3,median,rouge2_fmeasure,0.052818017804867626 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05332935191373514 +web_nlg_en,4,median,rouge2_fmeasure,0.05332935191373514 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05353546076223042 +web_nlg_en,5,median,rouge2_fmeasure,0.05353546076223042 +web_nlg_en,5,average,multiple,0.05265184507470723 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.036223836197762725 +wiki_lingua_en,0,median,rouge2_fmeasure,0.036223836197762725 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05092387173534868 +wiki_lingua_en,1,median,rouge2_fmeasure,0.05092387173534868 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05579012272313604 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05579012272313604 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04536872008122934 
+wiki_lingua_en,3,median,rouge2_fmeasure,0.04536872008122934 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013932175935494801 +wiki_lingua_en,4,median,rouge2_fmeasure,0.013932175935494801 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0021689783329266383 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0021689783329266383 +wiki_lingua_en,5,average,multiple,0.03406795083431637 diff --git a/4b284b28bc4seed2/evaluation/generation/merged.json b/4b284b28bc4seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..41245aa064ca6220a4942f5a86f08a16036c6043 --- /dev/null +++ b/4b284b28bc4seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33153388648094195, "bleu_stderr": 0.033005772984634425, "rouge1_fmeasure": 0.10892120808244464, "rouge1_fmeasure_stderr": 0.0019623579408930523, "rouge1_precision": 0.07122303608863641, "rouge1_precision_stderr": 0.0014591848220124259, "rouge1_recall": 0.30715399283349226, "rouge1_recall_stderr": 0.004708720401522559, "rouge2_fmeasure": 0.05087917177498001, "rouge2_fmeasure_stderr": 0.0012052999188796845, "rouge2_precision": 0.03301457183266829, "rouge2_precision_stderr": 0.0008587185375204866, "rouge2_recall": 0.14854387481989287, "rouge2_recall_stderr": 0.003193281253509643, "rougeL_fmeasure": 0.1046596223130991, "rougeL_fmeasure_stderr": 0.0018274939775881785, "rougeL_precision": 0.06820659069603296, "rougeL_precision_stderr": 0.0013384312711305548, "rougeL_recall": 0.29787923352571516, "rougeL_recall_stderr": 0.004614486799091354, "rougeLsum_fmeasure": 0.10379069725870145, "rougeLsum_fmeasure_stderr": 0.0018395610209924003, "rougeLsum_precision": 0.06782306790846614, "rougeLsum_precision_stderr": 0.0013642733366910549, "rougeLsum_recall": 0.29321572118705336, "rougeLsum_recall_stderr": 0.004444992160741671}}, "1": {"PALM_prompt": {"bleu": 0.4390216732028977, "bleu_stderr": 0.02835432276037686, "rouge1_fmeasure": 0.11512333876359498, "rouge1_fmeasure_stderr": 0.0017974110536525595, "rouge1_precision": 0.07291258825421418, "rouge1_precision_stderr": 0.0012781399789461385, "rouge1_recall": 0.3797532045177941, "rouge1_recall_stderr": 0.0054586371665182355, "rouge2_fmeasure": 0.05213699889610434, "rouge2_fmeasure_stderr": 0.0010956033295937493, "rouge2_precision": 0.032868611308100475, "rouge2_precision_stderr": 0.0007540345266181282, "rouge2_recall": 0.18168266286903503, "rouge2_recall_stderr": 0.0037422324762068537, "rougeL_fmeasure": 0.10756044811692632, "rougeL_fmeasure_stderr": 0.001603262453041204, "rougeL_precision": 0.06806601322392085, "rougeL_precision_stderr": 0.0011368722533979258, "rougeL_recall": 0.3553819785767932, "rougeL_recall_stderr": 0.004968399489043776, "rougeLsum_fmeasure": 0.10900979568571047, "rougeLsum_fmeasure_stderr": 0.0016850432132498889, "rougeLsum_precision": 0.06910714253449175, "rougeLsum_precision_stderr": 0.0012031621148103201, "rougeLsum_recall": 0.3583493585506855, "rougeLsum_recall_stderr": 0.005014235277134646}}, "2": {"PALM_prompt": {"bleu": 0.49541929016035524, "bleu_stderr": 0.025797242026580104, "rouge1_fmeasure": 0.11565855599729363, "rouge1_fmeasure_stderr": 0.0016715046204442972, "rouge1_precision": 0.07266725444927156, "rouge1_precision_stderr": 0.0011715118352015903, "rouge1_recall": 0.392286147400258, "rouge1_recall_stderr": 0.005249827988828273, "rouge2_fmeasure": 0.05321206929632585, "rouge2_fmeasure_stderr": 0.001036190867375003, "rouge2_precision": 0.03317260466126409, "rouge2_precision_stderr": 0.000696598360418341, 
"rouge2_recall": 0.1942947439780873, "rouge2_recall_stderr": 0.0038612870273807783, "rougeL_fmeasure": 0.10849753112690429, "rougeL_fmeasure_stderr": 0.0015324126427550334, "rougeL_precision": 0.06822626222269808, "rougeL_precision_stderr": 0.0010788080778881023, "rougeL_recall": 0.3658441206667177, "rougeL_recall_stderr": 0.004744182177069523, "rougeLsum_fmeasure": 0.10994078611093397, "rougeLsum_fmeasure_stderr": 0.0015743454993528729, "rougeLsum_precision": 0.06911868824793722, "rougeLsum_precision_stderr": 0.0011075456309651759, "rougeLsum_recall": 0.3723196892015273, "rougeLsum_recall_stderr": 0.004901964131441716}}, "3": {"PALM_prompt": {"bleu": 0.5491960579253196, "bleu_stderr": 0.039648429482997495, "rouge1_fmeasure": 0.11408517238759823, "rouge1_fmeasure_stderr": 0.0016665220312756322, "rouge1_precision": 0.07160364848852087, "rouge1_precision_stderr": 0.0011783430176818728, "rouge1_recall": 0.3927842849413568, "rouge1_recall_stderr": 0.00522433426165448, "rouge2_fmeasure": 0.052818017804867626, "rouge2_fmeasure_stderr": 0.0010623028200577843, "rouge2_precision": 0.03296327776337767, "rouge2_precision_stderr": 0.0007230725204977183, "rouge2_recall": 0.19533584043693603, "rouge2_recall_stderr": 0.0038707201775201655, "rougeL_fmeasure": 0.10630291986513662, "rougeL_fmeasure_stderr": 0.001526814085910438, "rougeL_precision": 0.06680503273038688, "rougeL_precision_stderr": 0.0010837322892585253, "rougeL_recall": 0.36333847242647704, "rougeL_recall_stderr": 0.004665099933938704, "rougeLsum_fmeasure": 0.10846020883153136, "rougeLsum_fmeasure_stderr": 0.0015890200535555973, "rougeLsum_precision": 0.0681283239197468, "rougeLsum_precision_stderr": 0.0011259165449605442, "rougeLsum_recall": 0.37225854905210065, "rougeLsum_recall_stderr": 0.004887731193821206}}, "4": {"PALM_prompt": {"bleu": 0.6089661489117988, "bleu_stderr": 0.04699623217901871, "rouge1_fmeasure": 0.11514000344336653, "rouge1_fmeasure_stderr": 0.0016184571331864207, "rouge1_precision": 0.07213518977803757, "rouge1_precision_stderr": 0.0011476197609056422, "rouge1_recall": 0.3992239446300711, "rouge1_recall_stderr": 0.005128894156765551, "rouge2_fmeasure": 0.05332935191373514, "rouge2_fmeasure_stderr": 0.0010277690042407401, "rouge2_precision": 0.033158630338207526, "rouge2_precision_stderr": 0.0006987407020504501, "rouge2_recall": 0.20032190789144683, "rouge2_recall_stderr": 0.0038236156171148734, "rougeL_fmeasure": 0.10644937066363529, "rougeL_fmeasure_stderr": 0.0014847353193029158, "rougeL_precision": 0.06683680650160255, "rougeL_precision_stderr": 0.0010578019098300508, "rougeL_recall": 0.36520991395591773, "rougeL_recall_stderr": 0.004493220490046675, "rougeLsum_fmeasure": 0.10920289975227614, "rougeLsum_fmeasure_stderr": 0.0015389130963739533, "rougeLsum_precision": 0.06848539496662703, "rougeLsum_precision_stderr": 0.001092766850551142, "rougeLsum_recall": 0.3773679563441165, "rougeLsum_recall_stderr": 0.004791768779964645}}, "5": {"PALM_prompt": {"bleu": 0.6220539692203896, "bleu_stderr": 0.027241061650766073, "rouge1_fmeasure": 0.11402382459454527, "rouge1_fmeasure_stderr": 0.0015862357548837988, "rouge1_precision": 0.0712682874127396, "rouge1_precision_stderr": 0.0011261508589404943, "rouge1_recall": 0.40368204669190927, "rouge1_recall_stderr": 0.005176457083171882, "rouge2_fmeasure": 0.05353546076223042, "rouge2_fmeasure_stderr": 0.0010071474225918618, "rouge2_precision": 0.0332020137953197, "rouge2_precision_stderr": 0.0006861186130499352, "rouge2_recall": 0.20677377146703504, "rouge2_recall_stderr": 
0.0039525209229281264, "rougeL_fmeasure": 0.10577538472070054, "rougeL_fmeasure_stderr": 0.0014621636026219001, "rougeL_precision": 0.06625798608998291, "rougeL_precision_stderr": 0.0010442629568554243, "rougeL_recall": 0.3704875696637656, "rougeL_recall_stderr": 0.004602314366146881, "rougeLsum_fmeasure": 0.10844065292929861, "rougeLsum_fmeasure_stderr": 0.0015071351512198398, "rougeLsum_precision": 0.06783456627617492, "rougeLsum_precision_stderr": 0.001071886270802083, "rougeLsum_recall": 0.382465197548837, "rougeLsum_recall_stderr": 0.004842551860701479}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5993765093997294, "bleu_stderr": 0.07506085369601702, "rouge1_fmeasure": 0.1779625720124565, "rouge1_fmeasure_stderr": 0.0018340890133845323, "rouge1_precision": 0.15182550911429873, "rouge1_precision_stderr": 0.0018688893384549484, "rouge1_recall": 0.2589242894485067, "rouge1_recall_stderr": 0.0026517724171458952, "rouge2_fmeasure": 0.036223836197762725, "rouge2_fmeasure_stderr": 0.0008360695449580615, "rouge2_precision": 0.030739869666078615, "rouge2_precision_stderr": 0.0007500707956482545, "rouge2_recall": 0.05422083314217582, "rouge2_recall_stderr": 0.0013617275020705638, "rougeL_fmeasure": 0.13797458766444107, "rougeL_fmeasure_stderr": 0.00128706736916593, "rougeL_precision": 0.1163914580615846, "rougeL_precision_stderr": 0.0012923332758585293, "rougeL_recall": 0.20559642048990517, "rougeL_recall_stderr": 0.0021063218448770372, "rougeLsum_fmeasure": 0.1625132579484488, "rougeLsum_fmeasure_stderr": 0.0016598158056400846, "rougeLsum_precision": 0.13832783036931234, "rougeLsum_precision_stderr": 0.0016858294437463758, "rougeLsum_recall": 0.23756775868205818, "rougeLsum_recall_stderr": 0.002450945116586128}}, "1": {"tldr_en": {"bleu": 2.5200403089231775, "bleu_stderr": 0.05693901717280776, "rouge1_fmeasure": 0.21598714377615563, "rouge1_fmeasure_stderr": 0.0019024221215778328, "rouge1_precision": 0.18496096563065467, "rouge1_precision_stderr": 0.002050002215955609, "rouge1_recall": 0.3135599264124284, "rouge1_recall_stderr": 0.0027341763298141864, "rouge2_fmeasure": 0.05092387173534868, "rouge2_fmeasure_stderr": 0.0009502922893863336, "rouge2_precision": 0.043438836648193035, "rouge2_precision_stderr": 0.0008698319584098167, "rouge2_recall": 0.07611495800976509, "rouge2_recall_stderr": 0.0016061241867093036, "rougeL_fmeasure": 0.15160062635446855, "rougeL_fmeasure_stderr": 0.001239572009665517, "rougeL_precision": 0.1281515256406243, "rougeL_precision_stderr": 0.001291169012274189, "rougeL_recall": 0.2264556395402032, "rougeL_recall_stderr": 0.0021416748555613593, "rougeLsum_fmeasure": 0.2019926405301408, "rougeLsum_fmeasure_stderr": 0.0017749314312316205, "rougeLsum_precision": 0.17280855254177024, "rougeLsum_precision_stderr": 0.0019099792075058242, "rougeLsum_recall": 0.2938664218975422, "rougeLsum_recall_stderr": 0.0025815210408538384}}, "2": {"tldr_en": {"bleu": 2.8246220069939314, "bleu_stderr": 0.05436929747000312, "rouge1_fmeasure": 0.22231924312892618, "rouge1_fmeasure_stderr": 0.0018757379527855871, "rouge1_precision": 0.19241271397106163, "rouge1_precision_stderr": 0.002086863578689543, "rouge1_recall": 0.32031046447154404, "rouge1_recall_stderr": 0.0027181669208385484, "rouge2_fmeasure": 0.05579012272313604, "rouge2_fmeasure_stderr": 0.0010124966408794026, "rouge2_precision": 0.048019665884255235, "rouge2_precision_stderr": 0.000949724643152317, "rouge2_recall": 0.0836751095859514, "rouge2_recall_stderr": 0.0017264035814644186, "rougeL_fmeasure": 0.15627127285797618, 
"rougeL_fmeasure_stderr": 0.0012523054332758818, "rougeL_precision": 0.13376141964932425, "rougeL_precision_stderr": 0.0013725108527129226, "rougeL_recall": 0.23124667730229143, "rougeL_recall_stderr": 0.0021820014245572915, "rougeLsum_fmeasure": 0.20921146077018363, "rougeLsum_fmeasure_stderr": 0.001755514182451968, "rougeLsum_precision": 0.18081556496647747, "rougeLsum_precision_stderr": 0.0019520981988559959, "rougeLsum_recall": 0.3025117234925768, "rougeLsum_recall_stderr": 0.0026055764102078075}}, "3": {"tldr_en": {"bleu": 2.775126184527832, "bleu_stderr": 0.04964262533072551, "rouge1_fmeasure": 0.1848100615479409, "rouge1_fmeasure_stderr": 0.0022579690258614704, "rouge1_precision": 0.16640193920440474, "rouge1_precision_stderr": 0.002472882187893366, "rouge1_recall": 0.2649663905434579, "rouge1_recall_stderr": 0.0032993991901049875, "rouge2_fmeasure": 0.04536872008122934, "rouge2_fmeasure_stderr": 0.001002873578150144, "rouge2_precision": 0.04051313387433241, "rouge2_precision_stderr": 0.001042560540352432, "rouge2_recall": 0.06748612349747192, "rouge2_recall_stderr": 0.0016580523507809426, "rougeL_fmeasure": 0.13129086818074612, "rougeL_fmeasure_stderr": 0.001545448883255699, "rougeL_precision": 0.11778833422399672, "rougeL_precision_stderr": 0.001764546075584022, "rougeL_recall": 0.19297885421052255, "rougeL_recall_stderr": 0.002524627264850371, "rougeLsum_fmeasure": 0.17351085580896572, "rougeLsum_fmeasure_stderr": 0.002116003523751111, "rougeLsum_precision": 0.1560147423913123, "rougeLsum_precision_stderr": 0.0023125754138923474, "rougeLsum_recall": 0.249482350470566, "rougeLsum_recall_stderr": 0.0031348571322463346}}, "4": {"tldr_en": {"bleu": 0.6328100673630783, "bleu_stderr": 0.039922860842299984, "rouge1_fmeasure": 0.05762199753523481, "rouge1_fmeasure_stderr": 0.0019234101466023426, "rouge1_precision": 0.05264325356647027, "rouge1_precision_stderr": 0.0019548941831179097, "rouge1_recall": 0.08637392317712084, "rouge1_recall_stderr": 0.0029446541600125875, "rouge2_fmeasure": 0.013932175935494801, "rouge2_fmeasure_stderr": 0.000677479417971868, "rouge2_precision": 0.012435473489888542, "rouge2_precision_stderr": 0.0006697583811981027, "rouge2_recall": 0.022066243551567184, "rouge2_recall_stderr": 0.001176386791377328, "rougeL_fmeasure": 0.042630350228712206, "rougeL_fmeasure_stderr": 0.0014055059931358702, "rougeL_precision": 0.038774748862737, "rougeL_precision_stderr": 0.0014187651157647537, "rougeL_recall": 0.06558722529459496, "rougeL_recall_stderr": 0.002298264781788869, "rougeLsum_fmeasure": 0.05397101577019339, "rougeLsum_fmeasure_stderr": 0.001795986537373056, "rougeLsum_precision": 0.049317168459700536, "rougeLsum_precision_stderr": 0.0018317279159510054, "rougeLsum_recall": 0.08119886671766297, "rougeLsum_recall_stderr": 0.002771971428999714}}, "5": {"tldr_en": {"bleu": 1.8294461842495436e-06, "bleu_stderr": 2.8960441784514307e-06, "rouge1_fmeasure": 0.009114245109778818, "rouge1_fmeasure_stderr": 0.0008561600546824989, "rouge1_precision": 0.008573781066321003, "rouge1_precision_stderr": 0.0008654762638624486, "rouge1_recall": 0.013672256166431493, "rouge1_recall_stderr": 0.0012985268683484443, "rouge2_fmeasure": 0.0021689783329266383, "rouge2_fmeasure_stderr": 0.00026415328416558304, "rouge2_precision": 0.001966748909838605, "rouge2_precision_stderr": 0.00025417327235094497, "rouge2_recall": 0.0033662241091160935, "rouge2_recall_stderr": 0.0004534618115491094, "rougeL_fmeasure": 0.006612705879560682, "rougeL_fmeasure_stderr": 0.0006186105809635613, 
"rougeL_precision": 0.006239534916325571, "rougeL_precision_stderr": 0.0006307296825652261, "rougeL_recall": 0.010041041492697021, "rougeL_recall_stderr": 0.0009608570474005339, "rougeLsum_fmeasure": 0.008378974054458118, "rougeLsum_fmeasure_stderr": 0.0007859574648450136, "rougeLsum_precision": 0.007874956167229618, "rougeLsum_precision_stderr": 0.0007946305678768076, "rougeLsum_recall": 0.01258508198697765, "rougeLsum_recall_stderr": 0.0011929631283971707}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 1.4134342817933134, "bleu_stderr": 0.10321165497606506, "rouge1_fmeasure": 0.07268079919042715, "rouge1_fmeasure_stderr": 0.0013299276492676378, "rouge1_precision": 0.10247313979693526, "rouge1_precision_stderr": 0.0016595089176992329, "rouge1_recall": 0.08798141485351638, "rouge1_recall_stderr": 0.0020421795807805876, "rouge2_fmeasure": 0.00993468622248673, "rouge2_fmeasure_stderr": 0.0005343002508124515, "rouge2_precision": 0.00887479842846684, "rouge2_precision_stderr": 0.0005272736841464594, "rouge2_recall": 0.014404059874404535, "rouge2_recall_stderr": 0.0007753598252854332, "rougeL_fmeasure": 0.07081998927780214, "rougeL_fmeasure_stderr": 0.0012624377157260348, "rougeL_precision": 0.10002794388883973, "rougeL_precision_stderr": 0.0015746721314263722, "rougeL_recall": 0.08578186838136309, "rougeL_recall_stderr": 0.001963617682885378, "rougeLsum_fmeasure": 0.06325547718223694, "rougeLsum_fmeasure_stderr": 0.001146593422118666, "rougeLsum_precision": 0.09505113511885002, "rougeLsum_precision_stderr": 0.0016104542116781076, "rougeLsum_recall": 0.07341402605300737, "rougeLsum_recall_stderr": 0.0017014954492227232}}, "1": {"generate_text_restaurant": {"bleu": 11.161361331881567, "bleu_stderr": 0.12372614680699576, "rouge1_fmeasure": 0.4325678229581062, "rouge1_fmeasure_stderr": 0.002292323583369995, "rouge1_precision": 0.5102128961431266, "rouge1_precision_stderr": 0.0032435467042979615, "rouge1_recall": 0.41996959404299955, "rouge1_recall_stderr": 0.003000739317309415, "rouge2_fmeasure": 0.19587297305804474, "rouge2_fmeasure_stderr": 0.0019046996832194681, "rouge2_precision": 0.2347486548964982, "rouge2_precision_stderr": 0.0025124030520577855, "rouge2_recall": 0.18995161561629698, "rouge2_recall_stderr": 0.002108346628324476, "rougeL_fmeasure": 0.3125033610005072, "rougeL_fmeasure_stderr": 0.001959745932298407, "rougeL_precision": 0.3718563664348661, "rougeL_precision_stderr": 0.0028475505595708522, "rougeL_recall": 0.302335969435588, "rougeL_recall_stderr": 0.002389554465547073, "rougeLsum_fmeasure": 0.35360222601109786, "rougeLsum_fmeasure_stderr": 0.002221056794276662, "rougeLsum_precision": 0.41834560442201607, "rougeLsum_precision_stderr": 0.0030773847535100003, "rougeLsum_recall": 0.3428712422996488, "rougeLsum_recall_stderr": 0.0027256967100605344}}, "2": {"generate_text_restaurant": {"bleu": 11.178478800336915, "bleu_stderr": 0.18720778468970048, "rouge1_fmeasure": 0.43564156295510353, "rouge1_fmeasure_stderr": 0.002087610230342593, "rouge1_precision": 0.4641458682797714, "rouge1_precision_stderr": 0.002971633166316723, "rouge1_recall": 0.45712689402679485, "rouge1_recall_stderr": 0.0028529123911751396, "rouge2_fmeasure": 0.20007291305638683, "rouge2_fmeasure_stderr": 0.0018220503474021836, "rouge2_precision": 0.21534769962339712, "rouge2_precision_stderr": 0.0023043178046714048, "rouge2_recall": 0.21108519516124935, "rouge2_recall_stderr": 0.002144830729960772, "rougeL_fmeasure": 0.3154129310627208, "rougeL_fmeasure_stderr": 0.0017997732274878523, 
"rougeL_precision": 0.3373737455708324, "rougeL_precision_stderr": 0.002541494576877095, "rougeL_recall": 0.3312050835060631, "rougeL_recall_stderr": 0.0023578769935104662, "rougeLsum_fmeasure": 0.36253839271187577, "rougeLsum_fmeasure_stderr": 0.0020879729234347427, "rougeLsum_precision": 0.38666807747150694, "rougeLsum_precision_stderr": 0.002816195982796137, "rougeLsum_recall": 0.3803128282949514, "rougeLsum_recall_stderr": 0.002684776810203093}}, "3": {"generate_text_restaurant": {"bleu": 11.078551573988406, "bleu_stderr": 0.1775440487553758, "rouge1_fmeasure": 0.4326675462538715, "rouge1_fmeasure_stderr": 0.0019582645124443902, "rouge1_precision": 0.43295946805897406, "rouge1_precision_stderr": 0.0024929063336741916, "rouge1_recall": 0.47117854558838085, "rouge1_recall_stderr": 0.0027664367541802696, "rouge2_fmeasure": 0.200583189798169, "rouge2_fmeasure_stderr": 0.0017572494683324837, "rouge2_precision": 0.2009303237548315, "rouge2_precision_stderr": 0.0019532985877392488, "rouge2_recall": 0.22075863091567335, "rouge2_recall_stderr": 0.0021918759824511784, "rougeL_fmeasure": 0.31563666321158207, "rougeL_fmeasure_stderr": 0.0017274457082887083, "rougeL_precision": 0.31641399110077373, "rougeL_precision_stderr": 0.002147128833979884, "rougeL_recall": 0.3443145885276902, "rougeL_recall_stderr": 0.002348509165132024, "rougeLsum_fmeasure": 0.36464056912358683, "rougeLsum_fmeasure_stderr": 0.002012766885114503, "rougeLsum_precision": 0.3650689748250584, "rougeLsum_precision_stderr": 0.0024222254606659755, "rougeLsum_recall": 0.3971452452707439, "rougeLsum_recall_stderr": 0.0026671950920676476}}, "4": {"generate_text_restaurant": {"bleu": 11.26779671250681, "bleu_stderr": 0.20945092499394355, "rouge1_fmeasure": 0.43468339695150743, "rouge1_fmeasure_stderr": 0.0019192301545953678, "rouge1_precision": 0.42504057259498224, "rouge1_precision_stderr": 0.002302694388890242, "rouge1_recall": 0.479467443521395, "rouge1_recall_stderr": 0.002694693864079371, "rouge2_fmeasure": 0.202879451815592, "rouge2_fmeasure_stderr": 0.001765189527045149, "rouge2_precision": 0.19787815742992643, "rouge2_precision_stderr": 0.0018370754370625954, "rouge2_recall": 0.22626550198253664, "rouge2_recall_stderr": 0.002204792167524998, "rougeL_fmeasure": 0.3172808489106893, "rougeL_fmeasure_stderr": 0.0017201307043284524, "rougeL_precision": 0.31019836982770554, "rougeL_precision_stderr": 0.001959650847184396, "rougeL_recall": 0.3507681740649588, "rougeL_recall_stderr": 0.0023395609242667513, "rougeLsum_fmeasure": 0.36655599606622596, "rougeLsum_fmeasure_stderr": 0.0019897208220121514, "rougeLsum_precision": 0.3582038693968529, "rougeLsum_precision_stderr": 0.0022423131040152167, "rougeLsum_recall": 0.40461547051620306, "rougeLsum_recall_stderr": 0.0026339178059616598}}, "5": {"generate_text_restaurant": {"bleu": 11.056827999458564, "bleu_stderr": 0.18078644474517908, "rouge1_fmeasure": 0.4309560903845341, "rouge1_fmeasure_stderr": 0.001893293831351894, "rouge1_precision": 0.41637768269876485, "rouge1_precision_stderr": 0.002227557311413217, "rouge1_recall": 0.48001165743040636, "rouge1_recall_stderr": 0.0026762804898328154, "rouge2_fmeasure": 0.20083246520570572, "rouge2_fmeasure_stderr": 0.0017445182528093705, "rouge2_precision": 0.19341945111333012, "rouge2_precision_stderr": 0.0017901333419283593, "rouge2_recall": 0.22629146190496788, "rouge2_recall_stderr": 0.002191585425335327, "rougeL_fmeasure": 0.31519997823452967, "rougeL_fmeasure_stderr": 0.0017133598116412512, "rougeL_precision": 0.303929949160996, 
"rougeL_precision_stderr": 0.0018891083086755413, "rougeL_recall": 0.35261189373922003, "rougeL_recall_stderr": 0.0023727357497467937, "rougeLsum_fmeasure": 0.3646984779801211, "rougeLsum_fmeasure_stderr": 0.001960028955951922, "rougeLsum_precision": 0.35227968062569365, "rougeLsum_precision_stderr": 0.0021866137017689286, "rougeLsum_recall": 0.4064043196987044, "rougeLsum_recall_stderr": 0.002605112678915691}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.295517441700607, "bleu_stderr": 0.09915357248852676, "rouge1_fmeasure": 0.21824095908596577, "rouge1_fmeasure_stderr": 0.0025874062237003148, "rouge1_precision": 0.16347573034711765, "rouge1_precision_stderr": 0.002235052767052065, "rouge1_recall": 0.361809595461701, "rouge1_recall_stderr": 0.004450055018587551, "rouge2_fmeasure": 0.055096743612893836, "rouge2_fmeasure_stderr": 0.0017015215521108862, "rouge2_precision": 0.04080119463614899, "rouge2_precision_stderr": 0.0013586822504416986, "rouge2_recall": 0.09489684727762387, "rouge2_recall_stderr": 0.0030304863214365187, "rougeL_fmeasure": 0.1649602106468539, "rougeL_fmeasure_stderr": 0.0019703191216796398, "rougeL_precision": 0.12349304284569401, "rougeL_precision_stderr": 0.0017244637096469517, "rougeL_recall": 0.27540408956152623, "rougeL_recall_stderr": 0.0035538299823413846, "rougeLsum_fmeasure": 0.1710468546463645, "rougeLsum_fmeasure_stderr": 0.0022098555753275687, "rougeLsum_precision": 0.12787359610941754, "rougeLsum_precision_stderr": 0.0018576292031407871, "rougeLsum_recall": 0.2854470256837463, "rougeLsum_recall_stderr": 0.003942621790562841}}, "1": {"article_DOC_summary": {"bleu": 1.3192475830096537, "bleu_stderr": 0.07504584122763394, "rouge1_fmeasure": 0.1759444559759621, "rouge1_fmeasure_stderr": 0.0024983727013912576, "rouge1_precision": 0.12498263517558904, "rouge1_precision_stderr": 0.0018439932798174727, "rouge1_recall": 0.30997040065867815, "rouge1_recall_stderr": 0.004377559967707829, "rouge2_fmeasure": 0.03416151854233213, "rouge2_fmeasure_stderr": 0.0013809223175216116, "rouge2_precision": 0.023984343121594594, "rouge2_precision_stderr": 0.0009703036853821361, "rouge2_recall": 0.061937587800487796, "rouge2_recall_stderr": 0.0025773754792855217, "rougeL_fmeasure": 0.13659440083004393, "rougeL_fmeasure_stderr": 0.0018529344115148084, "rougeL_precision": 0.09683506809823249, "rougeL_precision_stderr": 0.0013556223253912536, "rougeL_recall": 0.24217304938796147, "rougeL_recall_stderr": 0.003372900627372106, "rougeLsum_fmeasure": 0.14008466520947307, "rougeLsum_fmeasure_stderr": 0.0020194523247221608, "rougeLsum_precision": 0.099266332557722, "rougeLsum_precision_stderr": 0.0014699947767357671, "rougeLsum_recall": 0.2484370722635638, "rougeLsum_recall_stderr": 0.0036565129132140007}}, "2": {"article_DOC_summary": {"bleu": 1.3941983933749893, "bleu_stderr": 0.07778092606300743, "rouge1_fmeasure": 0.17317479594583315, "rouge1_fmeasure_stderr": 0.0024306874343318695, "rouge1_precision": 0.12308471402675025, "rouge1_precision_stderr": 0.0017964024163926668, "rouge1_recall": 0.30420591170874123, "rouge1_recall_stderr": 0.0042091034345315345, "rouge2_fmeasure": 0.03548103421040685, "rouge2_fmeasure_stderr": 0.0013648086670444103, "rouge2_precision": 0.024998880277592218, "rouge2_precision_stderr": 0.0009660509776349401, "rouge2_recall": 0.06368455444982171, "rouge2_recall_stderr": 0.002497504245191044, "rougeL_fmeasure": 0.13747942038634384, "rougeL_fmeasure_stderr": 0.0018617507575533767, "rougeL_precision": 0.09766064805279387, "rougeL_precision_stderr": 
0.0013763841420730348, "rougeL_recall": 0.24210898643660594, "rougeL_recall_stderr": 0.003269851529578185, "rougeLsum_fmeasure": 0.13676635773915868, "rougeLsum_fmeasure_stderr": 0.002012262737653545, "rougeLsum_precision": 0.09700451417856086, "rougeLsum_precision_stderr": 0.0014713838347837971, "rougeLsum_recall": 0.2416228650888391, "rougeLsum_recall_stderr": 0.003570749791914669}}, "3": {"article_DOC_summary": {"bleu": 1.447300262525524, "bleu_stderr": 0.0647434385662156, "rouge1_fmeasure": 0.17052015726603906, "rouge1_fmeasure_stderr": 0.0026006019517768233, "rouge1_precision": 0.1238798362783257, "rouge1_precision_stderr": 0.0020371925631676394, "rouge1_recall": 0.2938277668513655, "rouge1_recall_stderr": 0.004445484366807697, "rouge2_fmeasure": 0.03519128429327673, "rouge2_fmeasure_stderr": 0.0013864951253610754, "rouge2_precision": 0.025223685922742684, "rouge2_precision_stderr": 0.0010126897278031643, "rouge2_recall": 0.06217111485022419, "rouge2_recall_stderr": 0.0025204701650283143, "rougeL_fmeasure": 0.1359902514132361, "rougeL_fmeasure_stderr": 0.002000179300648128, "rougeL_precision": 0.09880564731360271, "rougeL_precision_stderr": 0.0015869573723169535, "rougeL_recall": 0.2348719293167949, "rougeL_recall_stderr": 0.0034616442328126336, "rougeLsum_fmeasure": 0.13420532702105284, "rougeLsum_fmeasure_stderr": 0.0020736765057759775, "rougeLsum_precision": 0.09736356730337334, "rougeLsum_precision_stderr": 0.0016253165748697476, "rougeLsum_recall": 0.23279761437533245, "rougeLsum_recall_stderr": 0.0036609997549228836}}, "4": {"article_DOC_summary": {"bleu": 0.6399540831907461, "bleu_stderr": 0.0665572258832638, "rouge1_fmeasure": 0.04587682793613896, "rouge1_fmeasure_stderr": 0.0026664431362576437, "rouge1_precision": 0.03916972701362441, "rouge1_precision_stderr": 0.0025232037599719035, "rouge1_recall": 0.0705187151734879, "rouge1_recall_stderr": 0.0041033703238095975, "rouge2_fmeasure": 0.008749872613988179, "rouge2_fmeasure_stderr": 0.0008910869873582503, "rouge2_precision": 0.007496872523694719, "rouge2_precision_stderr": 0.0009023223955939198, "rouge2_recall": 0.013768613253728622, "rouge2_recall_stderr": 0.0013735135943198867, "rougeL_fmeasure": 0.03641897598226805, "rougeL_fmeasure_stderr": 0.0020590368431718546, "rougeL_precision": 0.03154543945422246, "rougeL_precision_stderr": 0.0020528668246378276, "rougeL_recall": 0.05618648837737757, "rougeL_recall_stderr": 0.0032188263348742517, "rougeLsum_fmeasure": 0.036286088799465356, "rougeLsum_fmeasure_stderr": 0.002099169574521433, "rougeLsum_precision": 0.031542117455926555, "rougeLsum_precision_stderr": 0.0021071412880804725, "rougeLsum_recall": 0.055916896434064445, "rougeLsum_recall_stderr": 0.003269542426299847}}, "5": {"article_DOC_summary": {"bleu": 4.294601973766167e-36, "bleu_stderr": 7.391409466573723e-32, "rouge1_fmeasure": 0.002797377262876916, "rouge1_fmeasure_stderr": 0.0008436395309397021, "rouge1_precision": 0.0030262317078135763, "rouge1_precision_stderr": 0.0008861957572341574, "rouge1_recall": 0.002682824319713878, "rouge1_recall_stderr": 0.0008349754331976099, "rouge2_fmeasure": 0.0005852654722501501, "rouge2_fmeasure_stderr": 0.00037583234587724155, "rouge2_precision": 0.0005864695792553728, "rouge2_precision_stderr": 0.0003600452195293311, "rouge2_recall": 0.0005925913708932577, "rouge2_recall_stderr": 0.00039524060846512425, "rougeL_fmeasure": 0.002169100825975142, "rougeL_fmeasure_stderr": 0.0006545600161608136, "rougeL_precision": 0.0023057883289935395, "rougeL_precision_stderr": 
0.0006706823613650493, "rougeL_recall": 0.002115155017368573, "rougeL_recall_stderr": 0.0006595785512806381, "rougeLsum_fmeasure": 0.002243893464585761, "rougeLsum_fmeasure_stderr": 0.0006801725997564016, "rougeLsum_precision": 0.00240202126410433, "rougeLsum_precision_stderr": 0.0007072328465422081, "rougeLsum_recall": 0.0021726632985610644, "rougeLsum_recall_stderr": 0.0006770960348507971}}}} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_0.csv b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..44bb3014dfaa3b4fc46f3edb6fe27a9cafa9a033 --- /dev/null +++ b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.318,0.014734079309311901,0 +anli_r2,acc,0.331,0.01488827258820394,0 +anli_r3,acc,0.33916666666666667,0.013672343491681808,0 +arc_challenge,acc,0.28498293515358364,0.013191348179838793,0 +arc_challenge,acc_norm,0.30204778156996587,0.013417519144716422,0 +arc_easy,acc,0.6077441077441077,0.010018744689650043,0 +arc_easy,acc_norm,0.539983164983165,0.010226927233491506,0 +boolq,acc,0.5501529051987768,0.008700950643028798,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.1986111111111111,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.47649870543716394,0.004984266543053125,0 +hellaswag,acc_norm,0.6253734315873332,0.004830371317841073,0 +piqa,acc,0.7519042437431991,0.010077118315574719,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267314,0 +rte,acc,0.5487364620938628,0.029953149241808946,0 +sciq,acc,0.849,0.011328165223341671,0 +sciq,acc_norm,0.758,0.013550631705555956,0 +storycloze_2016,acc,0.7226082308925709,0.010353267472010765,0 +winogrande,acc,0.5753749013417522,0.01389189315026423,0 diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_0_lm-eval_global_step80108_2023-02-15-11-04-05_0shots_backup.json b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_0_lm-eval_global_step80108_2023-02-15-11-04-05_0shots_backup.json deleted file mode 100644 index 1ec971905a50ff4b07dc4ed5108c1d2c5f09cf6e..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_0_lm-eval_global_step80108_2023-02-15-11-04-05_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.318, - "acc_stderr": 0.014734079309311901 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.01488827258820394 - }, - "anli_r3": { - "acc": 0.33916666666666667, - "acc_stderr": 0.013672343491681808 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.1986111111111111 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.47649870543716394, - "acc_stderr": 0.004984266543053125, - "acc_norm": 0.6253734315873332, - "acc_norm_stderr": 0.004830371317841073 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808946 - }, - "winogrande": { - "acc": 0.5753749013417522, - "acc_stderr": 0.01389189315026423 - }, - "storycloze_2016": { - "acc": 0.7226082308925709, - "acc_stderr": 0.010353267472010765 - }, - "boolq": { - "acc": 0.5501529051987768, - "acc_stderr": 0.008700950643028798 - }, - "arc_easy": { - "acc": 0.6077441077441077, - "acc_stderr": 0.010018744689650043, - "acc_norm": 0.539983164983165, - "acc_norm_stderr": 0.010226927233491506 - }, - "arc_challenge": { - "acc": 0.28498293515358364, - "acc_stderr": 0.013191348179838793, - "acc_norm": 
0.30204778156996587, - "acc_norm_stderr": 0.013417519144716422 - }, - "sciq": { - "acc": 0.849, - "acc_stderr": 0.011328165223341671, - "acc_norm": 0.758, - "acc_norm_stderr": 0.013550631705555956 - }, - "piqa": { - "acc": 0.7519042437431991, - "acc_stderr": 0.010077118315574719, - "acc_norm": 0.7589771490750816, - "acc_norm_stderr": 0.009979042717267314 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_1.csv b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..2eabbba28019e7aeb3beea0339208cddbf856ebe --- /dev/null +++ b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732953,0 +anli_r2,acc,0.321,0.01477082181793464,0 +anli_r3,acc,0.3408333333333333,0.013688600793296936,0 +arc_challenge,acc,0.29692832764505117,0.013352025976725225,0 +arc_challenge,acc_norm,0.31313993174061433,0.013552671543623504,0 +arc_easy,acc,0.6224747474747475,0.009947227833469432,0 +arc_easy,acc_norm,0.5686026936026936,0.01016275284774751,0 +boolq,acc,0.6305810397553517,0.008441557531799619,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.3390804597701149,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.4743079067914758,0.004983189711208521,0 +hellaswag,acc_norm,0.625273849830711,0.004830628620181016,0 +piqa,acc,0.7480957562568009,0.010128421335088683,0 +piqa,acc_norm,0.7562568008705114,0.010017199471500609,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.889,0.009938701010583726,0 +sciq,acc_norm,0.867,0.010743669132397337,0 +storycloze_2016,acc,0.7113842864778194,0.01047831178564294,0 +winogrande,acc,0.5722178374112076,0.01390513401383995,0 diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json deleted file mode 100644 index 8c1c302161fdeb86fd0aef761677c38617f14ca2..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732953 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.01477082181793464 - }, - "anli_r3": { - "acc": 0.3408333333333333, - "acc_stderr": 0.013688600793296936 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.3390804597701149 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.4743079067914758, - "acc_stderr": 0.004983189711208521, - "acc_norm": 0.625273849830711, - "acc_norm_stderr": 0.004830628620181016 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.5722178374112076, - "acc_stderr": 0.01390513401383995 - }, - "storycloze_2016": { - "acc": 0.7113842864778194, - "acc_stderr": 0.01047831178564294 - }, - "boolq": { - "acc": 0.6305810397553517, - "acc_stderr": 0.008441557531799619 - }, - "arc_easy": { - "acc": 0.6224747474747475, - 
"acc_stderr": 0.009947227833469432, - "acc_norm": 0.5686026936026936, - "acc_norm_stderr": 0.01016275284774751 - }, - "arc_challenge": { - "acc": 0.29692832764505117, - "acc_stderr": 0.013352025976725225, - "acc_norm": 0.31313993174061433, - "acc_norm_stderr": 0.013552671543623504 - }, - "sciq": { - "acc": 0.889, - "acc_stderr": 0.009938701010583726, - "acc_norm": 0.867, - "acc_norm_stderr": 0.010743669132397337 - }, - "piqa": { - "acc": 0.7480957562568009, - "acc_stderr": 0.010128421335088683, - "acc_norm": 0.7562568008705114, - "acc_norm_stderr": 0.010017199471500609 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_2.csv b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..d73926ac7203c9ba0c46661686ca6e9647284987 --- /dev/null +++ b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.315,0.0146966319607925,0 +anli_r2,acc,0.345,0.015039986742055237,0 +anli_r3,acc,0.3175,0.013443538681348054,0 +arc_challenge,acc,0.30204778156996587,0.01341751914471642,0 +arc_challenge,acc_norm,0.3242320819112628,0.01367881039951882,0 +arc_easy,acc,0.6174242424242424,0.009972837790531477,0 +arc_easy,acc_norm,0.5942760942760943,0.01007575554012888,0 +boolq,acc,0.6339449541284403,0.008425419107728748,1 +cb,acc,0.17857142857142858,0.05164277182008721,1 +cb,f1,0.16728395061728393,,1 +copa,acc,0.81,0.03942772444036623,0 +hellaswag,acc,0.4751045608444533,0.004983592410934173,0 +hellaswag,acc_norm,0.6286596295558654,0.004821757734156732,0 +piqa,acc,0.749727965179543,0.010106561880089782,0 +piqa,acc_norm,0.7568008705114254,0.010009611953858922,0 +rte,acc,0.4981949458483754,0.030096267148976633,0 +sciq,acc,0.903,0.00936368937324811,0 +sciq,acc_norm,0.88,0.010281328012747386,0 +storycloze_2016,acc,0.7188669160876536,0.010395836091628112,0 +winogrande,acc,0.5927387529597474,0.013808654122417845,0 diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_2_lm-eval_global_step80108_2023-02-15-11-04-05_2shots_backup.json b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_2_lm-eval_global_step80108_2023-02-15-11-04-05_2shots_backup.json deleted file mode 100644 index 46d433470bc966dba154e648c3e02fa10576b93a..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_2_lm-eval_global_step80108_2023-02-15-11-04-05_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.315, - "acc_stderr": 0.0146966319607925 - }, - "anli_r2": { - "acc": 0.345, - "acc_stderr": 0.015039986742055237 - }, - "anli_r3": { - "acc": 0.3175, - "acc_stderr": 0.013443538681348054 - }, - "cb": { - "acc": 0.17857142857142858, - "acc_stderr": 0.05164277182008721, - "f1": 0.16728395061728393 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.4751045608444533, - "acc_stderr": 0.004983592410934173, - "acc_norm": 0.6286596295558654, - "acc_norm_stderr": 0.004821757734156732 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976633 - }, - "winogrande": { - "acc": 0.5927387529597474, - "acc_stderr": 0.013808654122417845 - }, - "storycloze_2016": { - 
"acc": 0.7188669160876536, - "acc_stderr": 0.010395836091628112 - }, - "boolq": { - "acc": 0.6339449541284403, - "acc_stderr": 0.008425419107728748 - }, - "arc_easy": { - "acc": 0.6174242424242424, - "acc_stderr": 0.009972837790531477, - "acc_norm": 0.5942760942760943, - "acc_norm_stderr": 0.01007575554012888 - }, - "arc_challenge": { - "acc": 0.30204778156996587, - "acc_stderr": 0.01341751914471642, - "acc_norm": 0.3242320819112628, - "acc_norm_stderr": 0.01367881039951882 - }, - "sciq": { - "acc": 0.903, - "acc_stderr": 0.00936368937324811, - "acc_norm": 0.88, - "acc_norm_stderr": 0.010281328012747386 - }, - "piqa": { - "acc": 0.749727965179543, - "acc_stderr": 0.010106561880089782, - "acc_norm": 0.7568008705114254, - "acc_norm_stderr": 0.010009611953858922 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_3.csv b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..770b3a679cc6ad123c75dbb12144449e72727d2e --- /dev/null +++ b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.331,0.01488827258820394,0 +anli_r2,acc,0.341,0.0149981313484027,0 +anli_r3,acc,0.34,0.0136804957257678,0 +arc_challenge,acc,0.3037542662116041,0.01343890918477875,0 +arc_challenge,acc_norm,0.3319112627986348,0.013760988200880538,0 +arc_easy,acc,0.625,0.009933992677987828,0 +arc_easy,acc_norm,0.5984848484848485,0.010058790020755562,0 +boolq,acc,0.6278287461773701,0.008454434247373908,1 +cb,acc,0.2857142857142857,0.060914490387317256,1 +cb,f1,0.2849772788024592,,1 +copa,acc,0.8,0.04020151261036843,0 +hellaswag,acc,0.47450707030472017,0.00498329157828904,0 +hellaswag,acc_norm,0.6297550288787094,0.004818833521340352,0 +piqa,acc,0.7486398258977149,0.010121156016819257,0 +piqa,acc_norm,0.7665941240478781,0.009869247889520998,0 +rte,acc,0.49458483754512633,0.03009469812323996,0 +sciq,acc,0.901,0.009449248027662751,0 +sciq,acc_norm,0.886,0.010055103435823332,0 +storycloze_2016,acc,0.7177979690005345,0.01040783447964767,0 +winogrande,acc,0.5761641673243884,0.013888492389944511,0 diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_3_lm-eval_global_step80108_2023-02-15-11-04-05_3shots_backup.json b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_3_lm-eval_global_step80108_2023-02-15-11-04-05_3shots_backup.json deleted file mode 100644 index 7d327a82c41d33f5a6312aecd06e4bf6d8152b18..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_3_lm-eval_global_step80108_2023-02-15-11-04-05_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.331, - "acc_stderr": 0.01488827258820394 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.0149981313484027 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.0136804957257678 - }, - "cb": { - "acc": 0.2857142857142857, - "acc_stderr": 0.060914490387317256, - "f1": 0.2849772788024592 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036843 - }, - "hellaswag": { - "acc": 0.47450707030472017, - "acc_stderr": 0.00498329157828904, - "acc_norm": 0.6297550288787094, - "acc_norm_stderr": 0.004818833521340352 - }, - "rte": { - "acc": 
0.49458483754512633, - "acc_stderr": 0.03009469812323996 - }, - "winogrande": { - "acc": 0.5761641673243884, - "acc_stderr": 0.013888492389944511 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.01040783447964767 - }, - "boolq": { - "acc": 0.6278287461773701, - "acc_stderr": 0.008454434247373908 - }, - "arc_easy": { - "acc": 0.625, - "acc_stderr": 0.009933992677987828, - "acc_norm": 0.5984848484848485, - "acc_norm_stderr": 0.010058790020755562 - }, - "arc_challenge": { - "acc": 0.3037542662116041, - "acc_stderr": 0.01343890918477875, - "acc_norm": 0.3319112627986348, - "acc_norm_stderr": 0.013760988200880538 - }, - "sciq": { - "acc": 0.901, - "acc_stderr": 0.009449248027662751, - "acc_norm": 0.886, - "acc_norm_stderr": 0.010055103435823332 - }, - "piqa": { - "acc": 0.7486398258977149, - "acc_stderr": 0.010121156016819257, - "acc_norm": 0.7665941240478781, - "acc_norm_stderr": 0.009869247889520998 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_4.csv b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..d40d6eea5d2132898f7ab8a7fa4e3c4be7d5b819 --- /dev/null +++ b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095526,0 +anli_r2,acc,0.335,0.014933117490932577,0 +anli_r3,acc,0.3383333333333333,0.013664144006618268,0 +arc_challenge,acc,0.3037542662116041,0.013438909184778759,0 +arc_challenge,acc_norm,0.3267918088737201,0.013706665975587338,0 +arc_easy,acc,0.6321548821548821,0.009894923464455191,0 +arc_easy,acc_norm,0.6077441077441077,0.010018744689650043,0 +boolq,acc,0.6284403669724771,0.008451598145076598,1 +cb,acc,0.21428571428571427,0.055328333517248834,1 +cb,f1,0.20694283133307526,,1 +copa,acc,0.77,0.042295258468165065,0 +hellaswag,acc,0.4749053973312089,0.004983492928102842,0 +hellaswag,acc_norm,0.6303525194184425,0.004817227292240292,0 +piqa,acc,0.7535364526659413,0.010054810789671824,0 +piqa,acc_norm,0.7665941240478781,0.009869247889520998,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.911,0.009008893392651523,0 +sciq,acc_norm,0.891,0.009859828407037186,0 +storycloze_2016,acc,0.7268840192410476,0.010303512765124683,0 +winogrande,acc,0.5951065509076559,0.013795927003124934,0 diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_4_lm-eval_global_step80108_2023-02-15-11-04-05_4shots_backup.json b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_4_lm-eval_global_step80108_2023-02-15-11-04-05_4shots_backup.json deleted file mode 100644 index a495c3d7cb4be80b0fbf74f463206b96398d1c97..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_4_lm-eval_global_step80108_2023-02-15-11-04-05_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r2": { - "acc": 0.335, - "acc_stderr": 0.014933117490932577 - }, - "anli_r3": { - "acc": 0.3383333333333333, - "acc_stderr": 0.013664144006618268 - }, - "cb": { - "acc": 0.21428571428571427, - "acc_stderr": 0.055328333517248834, - "f1": 0.20694283133307526 - }, - "copa": { - "acc": 0.77, - 
"acc_stderr": 0.042295258468165065 - }, - "hellaswag": { - "acc": 0.4749053973312089, - "acc_stderr": 0.004983492928102842, - "acc_norm": 0.6303525194184425, - "acc_norm_stderr": 0.004817227292240292 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5951065509076559, - "acc_stderr": 0.013795927003124934 - }, - "storycloze_2016": { - "acc": 0.7268840192410476, - "acc_stderr": 0.010303512765124683 - }, - "boolq": { - "acc": 0.6284403669724771, - "acc_stderr": 0.008451598145076598 - }, - "arc_easy": { - "acc": 0.6321548821548821, - "acc_stderr": 0.009894923464455191, - "acc_norm": 0.6077441077441077, - "acc_norm_stderr": 0.010018744689650043 - }, - "arc_challenge": { - "acc": 0.3037542662116041, - "acc_stderr": 0.013438909184778759, - "acc_norm": 0.3267918088737201, - "acc_norm_stderr": 0.013706665975587338 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651523, - "acc_norm": 0.891, - "acc_norm_stderr": 0.009859828407037186 - }, - "piqa": { - "acc": 0.7535364526659413, - "acc_stderr": 0.010054810789671824, - "acc_norm": 0.7665941240478781, - "acc_norm_stderr": 0.009869247889520998 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_5.csv b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..a2bfc2caffee4622984c328a567ff6414cd3059f --- /dev/null +++ b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.014876872027456729,0 +anli_r2,acc,0.329,0.014865395385928355,0 +anli_r3,acc,0.33166666666666667,0.013596836729485171,0 +arc_challenge,acc,0.32337883959044367,0.013669421630012129,0 +arc_challenge,acc_norm,0.3199658703071672,0.013631345807016195,0 +arc_easy,acc,0.6325757575757576,0.009892552616211555,0 +arc_easy,acc_norm,0.617003367003367,0.00997492038453649,0 +boolq,acc,0.6321100917431193,0.008434276591093021,1 +cb,acc,0.19642857142857142,0.05357142857142859,1 +cb,f1,0.19293024227234748,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.4727145986855208,0.004982346155911131,0 +hellaswag,acc_norm,0.6357299342760406,0.0048024139199326545,0 +piqa,acc,0.7453754080522307,0.010164432237060489,0 +piqa,acc_norm,0.7633297062023939,0.009916841655042809,0 +rte,acc,0.5379061371841155,0.030009848912529113,0 +sciq,acc,0.91,0.009054390204866439,0 +sciq,acc_norm,0.898,0.009575368801653873,0 +storycloze_2016,acc,0.7231427044361304,0.01034711289027693,0 +winogrande,acc,0.5974743488555643,0.013782866831703044,0 diff --git a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_5_lm-eval_global_step80108_2023-02-15-11-04-05_5shots_backup.json b/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_5_lm-eval_global_step80108_2023-02-15-11-04-05_5shots_backup.json deleted file mode 100644 index c743f194a5a743a7ac9262648a36853868505f57..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed2/evaluation/rankeval/4b284b28bc4seed2_5_lm-eval_global_step80108_2023-02-15-11-04-05_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.014876872027456729 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928355 - }, 
- "anli_r3": { - "acc": 0.33166666666666667, - "acc_stderr": 0.013596836729485171 - }, - "cb": { - "acc": 0.19642857142857142, - "acc_stderr": 0.05357142857142859, - "f1": 0.19293024227234748 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.4727145986855208, - "acc_stderr": 0.004982346155911131, - "acc_norm": 0.6357299342760406, - "acc_norm_stderr": 0.0048024139199326545 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.5974743488555643, - "acc_stderr": 0.013782866831703044 - }, - "storycloze_2016": { - "acc": 0.7231427044361304, - "acc_stderr": 0.01034711289027693 - }, - "boolq": { - "acc": 0.6321100917431193, - "acc_stderr": 0.008434276591093021 - }, - "arc_easy": { - "acc": 0.6325757575757576, - "acc_stderr": 0.009892552616211555, - "acc_norm": 0.617003367003367, - "acc_norm_stderr": 0.00997492038453649 - }, - "arc_challenge": { - "acc": 0.32337883959044367, - "acc_stderr": 0.013669421630012129, - "acc_norm": 0.3199658703071672, - "acc_norm_stderr": 0.013631345807016195 - }, - "sciq": { - "acc": 0.91, - "acc_stderr": 0.009054390204866439, - "acc_norm": 0.898, - "acc_norm_stderr": 0.009575368801653873 - }, - "piqa": { - "acc": 0.7453754080522307, - "acc_stderr": 0.010164432237060489, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/generation/merged.csv b/4b284b28bc4seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..009506c7b39330f48ed1543ddc448326467bb982 --- /dev/null +++ b/4b284b28bc4seed3/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0750677147834178 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0750677147834178 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.14986852530881986 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.14986852530881986 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.181238613552319 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.181238613552319 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.18974320403636494 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.18974320403636494 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.19376219920647275 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.19376219920647275 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19587324037834244 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19587324037834244 +e2e_nlg_cleaned,5,average,multiple,0.16425891621095612 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04399744295623738 +gem_xsum,0,median,rouge2_fmeasure,0.04399744295623738 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03148436075040743 +gem_xsum,1,median,rouge2_fmeasure,0.03148436075040743 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.031093372281268725 +gem_xsum,2,median,rouge2_fmeasure,0.031093372281268725 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03135863836588418 +gem_xsum,3,median,rouge2_fmeasure,0.03135863836588418 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009627528506364348 +gem_xsum,4,median,rouge2_fmeasure,0.009627528506364348 
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0004511007812344588 +gem_xsum,5,median,rouge2_fmeasure,0.0004511007812344588 +gem_xsum,5,average,multiple,0.024668740606899418 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05030332747327783 +web_nlg_en,0,median,rouge2_fmeasure,0.05030332747327783 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051085906931171744 +web_nlg_en,1,median,rouge2_fmeasure,0.051085906931171744 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.052438296552281494 +web_nlg_en,2,median,rouge2_fmeasure,0.052438296552281494 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.053701976919488074 +web_nlg_en,3,median,rouge2_fmeasure,0.053701976919488074 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05408499700220169 +web_nlg_en,4,median,rouge2_fmeasure,0.05408499700220169 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054252618076757034 +web_nlg_en,5,median,rouge2_fmeasure,0.054252618076757034 +web_nlg_en,5,average,multiple,0.05264452049252964 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.027536298312222936 +wiki_lingua_en,0,median,rouge2_fmeasure,0.027536298312222936 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04692285315751292 +wiki_lingua_en,1,median,rouge2_fmeasure,0.04692285315751292 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05192650547465838 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05192650547465838 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.045243645176363284 +wiki_lingua_en,3,median,rouge2_fmeasure,0.045243645176363284 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014905869113824048 +wiki_lingua_en,4,median,rouge2_fmeasure,0.014905869113824048 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0023077580857873777 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0023077580857873777 +wiki_lingua_en,5,average,multiple,0.03147382155339482 diff --git a/4b284b28bc4seed3/evaluation/generation/merged.json b/4b284b28bc4seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..e9aa522f9652b60b05e0b63e794ea083f43fab26 --- /dev/null +++ b/4b284b28bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.35092030525579987, "bleu_stderr": 0.04513752019671635, "rouge1_fmeasure": 0.10668665131573762, "rouge1_fmeasure_stderr": 0.0019963304628015832, "rouge1_precision": 0.06981936667408743, "rouge1_precision_stderr": 0.001473328613173937, "rouge1_recall": 0.2898378467591775, "rouge1_recall_stderr": 0.004420681682062187, "rouge2_fmeasure": 0.05030332747327783, "rouge2_fmeasure_stderr": 0.001248625720436871, "rouge2_precision": 0.032790254442078545, "rouge2_precision_stderr": 0.0009004212723231938, "rouge2_recall": 0.14110910701776389, "rouge2_recall_stderr": 0.003077172133370463, "rougeL_fmeasure": 0.10309579270491318, "rougeL_fmeasure_stderr": 0.0018586300746194161, "rougeL_precision": 0.06724512679042116, "rougeL_precision_stderr": 0.0013469997217561611, "rougeL_recall": 0.2826819906334472, "rougeL_recall_stderr": 0.004340393008834924, "rougeLsum_fmeasure": 0.10235320219135983, "rougeLsum_fmeasure_stderr": 0.0018735247246252473, "rougeLsum_precision": 0.06690927840170163, "rougeLsum_precision_stderr": 0.0013776966657478994, "rougeLsum_recall": 0.27937547516216105, "rougeLsum_recall_stderr": 0.004232594249716667}}, "1": {"PALM_prompt": {"bleu": 0.4887135545339655, "bleu_stderr": 0.038207624916546064, "rouge1_fmeasure": 0.11005350518870848, "rouge1_fmeasure_stderr": 0.001863018182834003, "rouge1_precision": 0.07047823375611638, "rouge1_precision_stderr": 0.0013629185289183434, "rouge1_recall": 0.352113375646551, 
"rouge1_recall_stderr": 0.005212176176263606, "rouge2_fmeasure": 0.051085906931171744, "rouge2_fmeasure_stderr": 0.0011718163430073354, "rouge2_precision": 0.032678708300078345, "rouge2_precision_stderr": 0.0008350705039830835, "rouge2_recall": 0.17036081855757026, "rouge2_recall_stderr": 0.0036533260651395675, "rougeL_fmeasure": 0.10321588752678051, "rougeL_fmeasure_stderr": 0.001659748600170577, "rougeL_precision": 0.06602243378358179, "rougeL_precision_stderr": 0.0012047343606144839, "rougeL_recall": 0.3299025252012076, "rougeL_recall_stderr": 0.004711883556335918, "rougeLsum_fmeasure": 0.10490474504796687, "rougeLsum_fmeasure_stderr": 0.0017630576769512532, "rougeLsum_precision": 0.06724334284661879, "rougeLsum_precision_stderr": 0.0012905898446151749, "rougeLsum_recall": 0.3335950480627132, "rougeLsum_recall_stderr": 0.004795979080397771}}, "2": {"PALM_prompt": {"bleu": 0.4954577156236074, "bleu_stderr": 0.03940656960812993, "rouge1_fmeasure": 0.11261142270012729, "rouge1_fmeasure_stderr": 0.0017357579521792847, "rouge1_precision": 0.0714866607573638, "rouge1_precision_stderr": 0.0012477048923611861, "rouge1_recall": 0.366704464940973, "rouge1_recall_stderr": 0.005105137507748489, "rouge2_fmeasure": 0.052438296552281494, "rouge2_fmeasure_stderr": 0.0011033633061244077, "rouge2_precision": 0.03311033995538127, "rouge2_precision_stderr": 0.0007648297655042665, "rouge2_recall": 0.18155051913557482, "rouge2_recall_stderr": 0.00371730981090646, "rougeL_fmeasure": 0.10537631476024473, "rougeL_fmeasure_stderr": 0.001548032182806995, "rougeL_precision": 0.06687610955905898, "rougeL_precision_stderr": 0.0011120004143352184, "rougeL_recall": 0.3422071101265069, "rougeL_recall_stderr": 0.004593456998282074, "rougeLsum_fmeasure": 0.1071828571812226, "rougeLsum_fmeasure_stderr": 0.00163276079489414, "rougeLsum_precision": 0.06807059659117397, "rougeLsum_precision_stderr": 0.0011745555579416242, "rougeLsum_recall": 0.3484145276857689, "rougeLsum_recall_stderr": 0.004752677986072459}}, "3": {"PALM_prompt": {"bleu": 0.5376401397749783, "bleu_stderr": 0.042953510015901336, "rouge1_fmeasure": 0.11391880759717438, "rouge1_fmeasure_stderr": 0.0017214645061344748, "rouge1_precision": 0.07213772505487824, "rouge1_precision_stderr": 0.0012404738886852833, "rouge1_recall": 0.3776942956991359, "rouge1_recall_stderr": 0.00518895191365285, "rouge2_fmeasure": 0.053701976919488074, "rouge2_fmeasure_stderr": 0.0011062693098051102, "rouge2_precision": 0.033831854491709834, "rouge2_precision_stderr": 0.000767112875303506, "rouge2_recall": 0.18902052205137462, "rouge2_recall_stderr": 0.003745503144139708, "rougeL_fmeasure": 0.10636377632235701, "rougeL_fmeasure_stderr": 0.0015396374929510118, "rougeL_precision": 0.0673760809566409, "rougeL_precision_stderr": 0.0011092327236371367, "rougeL_recall": 0.3510488403621821, "rougeL_recall_stderr": 0.0046683703819187594, "rougeLsum_fmeasure": 0.10848385182183909, "rougeLsum_fmeasure_stderr": 0.0016325541592215193, "rougeLsum_precision": 0.06876714809423898, "rougeLsum_precision_stderr": 0.001180603104206542, "rougeLsum_recall": 0.3584958814094978, "rougeLsum_recall_stderr": 0.004825789898873508}}, "4": {"PALM_prompt": {"bleu": 0.5162458189892425, "bleu_stderr": 0.04567561479003356, "rouge1_fmeasure": 0.11538388270081333, "rouge1_fmeasure_stderr": 0.0017082303946239072, "rouge1_precision": 0.07313568376833962, "rouge1_precision_stderr": 0.0012433254028699097, "rouge1_recall": 0.3808222176442103, "rouge1_recall_stderr": 0.005017313267908712, "rouge2_fmeasure": 
0.05408499700220169, "rouge2_fmeasure_stderr": 0.0010803111391284627, "rouge2_precision": 0.03409652402405422, "rouge2_precision_stderr": 0.0007575603665210839, "rouge2_recall": 0.1913005341448632, "rouge2_recall_stderr": 0.003690388734385289, "rougeL_fmeasure": 0.10721312824281004, "rougeL_fmeasure_stderr": 0.0015059136603345965, "rougeL_precision": 0.06801047825803394, "rougeL_precision_stderr": 0.0011036108321794546, "rougeL_recall": 0.35323488966561345, "rougeL_recall_stderr": 0.004471460082976139, "rougeLsum_fmeasure": 0.10984711639426804, "rougeLsum_fmeasure_stderr": 0.0016152114960555307, "rougeLsum_precision": 0.06970710378600671, "rougeLsum_precision_stderr": 0.0011816060505639649, "rougeLsum_recall": 0.3612576150943837, "rougeLsum_recall_stderr": 0.004665280252810779}}, "5": {"PALM_prompt": {"bleu": 0.5915390188723657, "bleu_stderr": 0.044999471378700744, "rouge1_fmeasure": 0.1156135546890491, "rouge1_fmeasure_stderr": 0.0016639879184558524, "rouge1_precision": 0.07308133165879348, "rouge1_precision_stderr": 0.0012280730859575018, "rouge1_recall": 0.39313349727864033, "rouge1_recall_stderr": 0.005208530742307819, "rouge2_fmeasure": 0.054252618076757034, "rouge2_fmeasure_stderr": 0.0010618703958199596, "rouge2_precision": 0.03409018556609139, "rouge2_precision_stderr": 0.000749916416836098, "rouge2_recall": 0.19987682746116572, "rouge2_recall_stderr": 0.003867392245272622, "rougeL_fmeasure": 0.10635472804850638, "rougeL_fmeasure_stderr": 0.001478213765313952, "rougeL_precision": 0.06736116168446518, "rougeL_precision_stderr": 0.0011057761206049632, "rougeL_recall": 0.3596279768743875, "rougeL_recall_stderr": 0.004551765129386944, "rougeLsum_fmeasure": 0.10968223642629092, "rougeLsum_fmeasure_stderr": 0.0015751819563328757, "rougeLsum_precision": 0.06946965842837552, "rougeLsum_precision_stderr": 0.0011735301458215054, "rougeLsum_recall": 0.37102628840990604, "rougeLsum_recall_stderr": 0.004786162392169987}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.2222578150102583, "bleu_stderr": 0.06328685065190241, "rouge1_fmeasure": 0.1471218733322546, "rouge1_fmeasure_stderr": 0.0019086674422868044, "rouge1_precision": 0.12728317987175145, "rouge1_precision_stderr": 0.0019543197768273073, "rouge1_recall": 0.21340145861239856, "rouge1_recall_stderr": 0.0027771001830022594, "rouge2_fmeasure": 0.027536298312222936, "rouge2_fmeasure_stderr": 0.0007720385992298698, "rouge2_precision": 0.023495678323026473, "rouge2_precision_stderr": 0.0007149798061630863, "rouge2_recall": 0.04189401531194056, "rouge2_recall_stderr": 0.0013277090457311043, "rougeL_fmeasure": 0.11779028020448569, "rougeL_fmeasure_stderr": 0.001400602568948609, "rougeL_precision": 0.10103719194551143, "rougeL_precision_stderr": 0.0014559595916573205, "rougeL_recall": 0.1749892611099076, "rougeL_recall_stderr": 0.0022848816135267102, "rougeLsum_fmeasure": 0.13579115205056982, "rougeLsum_fmeasure_stderr": 0.0017452654195332434, "rougeLsum_precision": 0.1174405809942993, "rougeLsum_precision_stderr": 0.001802707885416322, "rougeLsum_recall": 0.19766216246005014, "rougeLsum_recall_stderr": 0.00258101670904122}}, "1": {"tldr_en": {"bleu": 2.391147100552601, "bleu_stderr": 0.04006224158757559, "rouge1_fmeasure": 0.19732650210441122, "rouge1_fmeasure_stderr": 0.0020510877426877347, "rouge1_precision": 0.1736334455823323, "rouge1_precision_stderr": 0.0022976987647117656, "rouge1_recall": 0.28354394939217903, "rouge1_recall_stderr": 0.0029641856324906304, "rouge2_fmeasure": 0.04692285315751292, "rouge2_fmeasure_stderr": 
0.0010196597076878576, "rouge2_precision": 0.04141351020002613, "rouge2_precision_stderr": 0.001065936730790372, "rouge2_recall": 0.0699662145493009, "rouge2_recall_stderr": 0.0016954247431350791, "rougeL_fmeasure": 0.14379398280935196, "rougeL_fmeasure_stderr": 0.001412248102591144, "rougeL_precision": 0.12577530969575892, "rougeL_precision_stderr": 0.0016573447824456509, "rougeL_recall": 0.2115227804388362, "rougeL_recall_stderr": 0.0023043951894502297, "rougeLsum_fmeasure": 0.18431976378633813, "rougeLsum_fmeasure_stderr": 0.001921122990992563, "rougeLsum_precision": 0.16211598975071775, "rougeLsum_precision_stderr": 0.002164725932714466, "rougeLsum_recall": 0.2656809680060299, "rougeLsum_recall_stderr": 0.002812085736561483}}, "2": {"tldr_en": {"bleu": 2.7171126284576332, "bleu_stderr": 0.04815677557686793, "rouge1_fmeasure": 0.2084266165739667, "rouge1_fmeasure_stderr": 0.001999753547385771, "rouge1_precision": 0.19459311097597148, "rouge1_precision_stderr": 0.002498974869930833, "rouge1_recall": 0.28833446397621476, "rouge1_recall_stderr": 0.002863676862389607, "rouge2_fmeasure": 0.05192650547465838, "rouge2_fmeasure_stderr": 0.0010701161009785201, "rouge2_precision": 0.050412113693174034, "rouge2_precision_stderr": 0.001351379848487111, "rouge2_recall": 0.07329394037381169, "rouge2_recall_stderr": 0.0016677245426515634, "rougeL_fmeasure": 0.15280735738097753, "rougeL_fmeasure_stderr": 0.0014043929947908011, "rougeL_precision": 0.1431722098886608, "rougeL_precision_stderr": 0.001914919098242861, "rougeL_recall": 0.2153296115668877, "rougeL_recall_stderr": 0.0022634798460424333, "rougeLsum_fmeasure": 0.1956803458833979, "rougeLsum_fmeasure_stderr": 0.001874943202214558, "rougeLsum_precision": 0.1826592257718392, "rougeLsum_precision_stderr": 0.0023613926909123446, "rougeLsum_recall": 0.27102904289356916, "rougeLsum_recall_stderr": 0.002697214827427601}}, "3": {"tldr_en": {"bleu": 2.7422220260434083, "bleu_stderr": 0.07994048977227425, "rouge1_fmeasure": 0.17822188610923453, "rouge1_fmeasure_stderr": 0.002313579710983267, "rouge1_precision": 0.1833771887413479, "rouge1_precision_stderr": 0.0030737417465454956, "rouge1_recall": 0.24146816828202433, "rouge1_recall_stderr": 0.0033746604530580163, "rouge2_fmeasure": 0.045243645176363284, "rouge2_fmeasure_stderr": 0.001053472542589495, "rouge2_precision": 0.04870214637199058, "rouge2_precision_stderr": 0.0015294386752092305, "rouge2_recall": 0.06300025230653579, "rouge2_recall_stderr": 0.001665082880499784, "rougeL_fmeasure": 0.131733000955773, "rougeL_fmeasure_stderr": 0.0016654922516170963, "rougeL_precision": 0.13827180021901367, "rougeL_precision_stderr": 0.0024999501979138853, "rougeL_recall": 0.18075620846507245, "rougeL_recall_stderr": 0.002606651906742854, "rougeLsum_fmeasure": 0.16787441794691876, "rougeLsum_fmeasure_stderr": 0.002180132432051263, "rougeLsum_precision": 0.17302447151119815, "rougeLsum_precision_stderr": 0.0029298205686312893, "rougeLsum_recall": 0.2277850010309598, "rougeLsum_recall_stderr": 0.003197183070245692}}, "4": {"tldr_en": {"bleu": 0.5925437754263059, "bleu_stderr": 0.04242065704358776, "rouge1_fmeasure": 0.05723884996198824, "rouge1_fmeasure_stderr": 0.0019771316422231024, "rouge1_precision": 0.06330716830934252, "rouge1_precision_stderr": 0.0025709225823679588, "rouge1_recall": 0.07845690859829274, "rouge1_recall_stderr": 0.002822969061122126, "rouge2_fmeasure": 0.014905869113824048, "rouge2_fmeasure_stderr": 0.0007403899631872741, "rouge2_precision": 0.017858459821703032, "rouge2_precision_stderr": 
0.0012094921987221639, "rouge2_recall": 0.021543413757512866, "rouge2_recall_stderr": 0.0011877190270122848, "rougeL_fmeasure": 0.043631603403088595, "rougeL_fmeasure_stderr": 0.001500131477167622, "rougeL_precision": 0.0491984487986743, "rougeL_precision_stderr": 0.0020685970258627613, "rougeL_recall": 0.06047877336706885, "rougeL_recall_stderr": 0.002215110058371633, "rougeLsum_fmeasure": 0.05373718833644186, "rougeLsum_fmeasure_stderr": 0.0018538420089590803, "rougeLsum_precision": 0.0596627080267218, "rougeLsum_precision_stderr": 0.0024418625671780636, "rougeLsum_recall": 0.07373240073484355, "rougeLsum_recall_stderr": 0.0026557359756667646}}, "5": {"tldr_en": {"bleu": 4.93737386207863e-07, "bleu_stderr": 1.1103847235234578e-06, "rouge1_fmeasure": 0.008992771424476102, "rouge1_fmeasure_stderr": 0.0008967572939909481, "rouge1_precision": 0.01062522482426178, "rouge1_precision_stderr": 0.0012092617892710666, "rouge1_recall": 0.01185222370379365, "rouge1_recall_stderr": 0.0012020911444765917, "rouge2_fmeasure": 0.0023077580857873777, "rouge2_fmeasure_stderr": 0.0003193569948458837, "rouge2_precision": 0.002927154149860958, "rouge2_precision_stderr": 0.000550773136343612, "rouge2_recall": 0.002996639644150718, "rouge2_recall_stderr": 0.0004061218977723996, "rougeL_fmeasure": 0.006734668579961652, "rougeL_fmeasure_stderr": 0.0006654911916185634, "rougeL_precision": 0.00826237840691835, "rougeL_precision_stderr": 0.000987476405544466, "rougeL_recall": 0.008978430311958394, "rougeL_recall_stderr": 0.0009171739341734633, "rougeLsum_fmeasure": 0.008370404629808693, "rougeLsum_fmeasure_stderr": 0.000834783139372695, "rougeLsum_precision": 0.009989825480627754, "rougeLsum_precision_stderr": 0.001152294422344019, "rougeLsum_recall": 0.011036254899513882, "rougeLsum_recall_stderr": 0.0011187636476075458}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.5763061428514815, "bleu_stderr": 0.08841895367881542, "rouge1_fmeasure": 0.18522377860735378, "rouge1_fmeasure_stderr": 0.001793031425512448, "rouge1_precision": 0.13693099764528335, "rouge1_precision_stderr": 0.0015356004169807865, "rouge1_recall": 0.30589803506201596, "rouge1_recall_stderr": 0.002569064963211563, "rouge2_fmeasure": 0.0750677147834178, "rouge2_fmeasure_stderr": 0.001192303759950116, "rouge2_precision": 0.05608296665902916, "rouge2_precision_stderr": 0.001060781879450204, "rouge2_recall": 0.12405447717993638, "rouge2_recall_stderr": 0.0019165396671815943, "rougeL_fmeasure": 0.1637374521643533, "rougeL_fmeasure_stderr": 0.0014672700966103522, "rougeL_precision": 0.12077559035745344, "rougeL_precision_stderr": 0.0012933898621559714, "rougeL_recall": 0.2725557899490075, "rougeL_recall_stderr": 0.0021592023164069916, "rougeLsum_fmeasure": 0.16156525416128376, "rougeLsum_fmeasure_stderr": 0.0016584383409070493, "rougeLsum_precision": 0.11941400647321004, "rougeLsum_precision_stderr": 0.0014208570924380114, "rougeLsum_recall": 0.2675565021971549, "rougeLsum_recall_stderr": 0.0024460229189680735}}, "1": {"generate_text_restaurant": {"bleu": 7.889074185047009, "bleu_stderr": 0.12895617864019385, "rouge1_fmeasure": 0.3819329232968168, "rouge1_fmeasure_stderr": 0.002058924047819458, "rouge1_precision": 0.37345813482947465, "rouge1_precision_stderr": 0.00235563045864244, "rouge1_recall": 0.425568058263057, "rouge1_recall_stderr": 0.0027502764404919496, "rouge2_fmeasure": 0.14986852530881986, "rouge2_fmeasure_stderr": 0.001646906448593975, "rouge2_precision": 0.1457310116329793, "rouge2_precision_stderr": 
0.0016791811257879754, "rouge2_recall": 0.1691753347651205, "rouge2_recall_stderr": 0.0020338555184736705, "rougeL_fmeasure": 0.26455232520366323, "rougeL_fmeasure_stderr": 0.0016261234117796919, "rougeL_precision": 0.25802392248926737, "rougeL_precision_stderr": 0.0017801562545908425, "rougeL_recall": 0.2964926298625058, "rougeL_recall_stderr": 0.0022135158513236245, "rougeLsum_fmeasure": 0.31785134565072753, "rougeLsum_fmeasure_stderr": 0.0019390533889751562, "rougeLsum_precision": 0.31083590587813814, "rougeLsum_precision_stderr": 0.0021555685632575444, "rougeLsum_recall": 0.3542250176651235, "rougeLsum_recall_stderr": 0.002527520077331439}}, "2": {"generate_text_restaurant": {"bleu": 9.430394273853375, "bleu_stderr": 0.1847232705243506, "rouge1_fmeasure": 0.418885586013751, "rouge1_fmeasure_stderr": 0.002007463993097918, "rouge1_precision": 0.40831216737868514, "rouge1_precision_stderr": 0.0023142524211960777, "rouge1_recall": 0.4661123939279311, "rouge1_recall_stderr": 0.002837142860791685, "rouge2_fmeasure": 0.181238613552319, "rouge2_fmeasure_stderr": 0.0017018054740245462, "rouge2_precision": 0.17588735757182558, "rouge2_precision_stderr": 0.0017565431329739812, "rouge2_recall": 0.20469246454092782, "rouge2_recall_stderr": 0.0021698678454790243, "rougeL_fmeasure": 0.2955881450808913, "rougeL_fmeasure_stderr": 0.0016512442074684325, "rougeL_precision": 0.28785435905822937, "rougeL_precision_stderr": 0.001838982518273614, "rougeL_recall": 0.3302177769679092, "rougeL_recall_stderr": 0.0023254365486443333, "rougeLsum_fmeasure": 0.34857096921066205, "rougeLsum_fmeasure_stderr": 0.0019611752925251216, "rougeLsum_precision": 0.3397731417682527, "rougeLsum_precision_stderr": 0.0021769796707002188, "rougeLsum_recall": 0.3880429438073185, "rougeLsum_recall_stderr": 0.002651222748349836}}, "3": {"generate_text_restaurant": {"bleu": 10.113902500867782, "bleu_stderr": 0.11723679605888841, "rouge1_fmeasure": 0.42494100808108426, "rouge1_fmeasure_stderr": 0.0019848574037000756, "rouge1_precision": 0.4113193603539492, "rouge1_precision_stderr": 0.002326696320360366, "rouge1_recall": 0.4762578661058886, "rouge1_recall_stderr": 0.002803809063258203, "rouge2_fmeasure": 0.18974320403636494, "rouge2_fmeasure_stderr": 0.001728075653184395, "rouge2_precision": 0.18289494079057925, "rouge2_precision_stderr": 0.0017885733052731077, "rouge2_recall": 0.21585034907240166, "rouge2_recall_stderr": 0.0022144084445730352, "rougeL_fmeasure": 0.3038282693785727, "rougeL_fmeasure_stderr": 0.0016658099172296667, "rougeL_precision": 0.2943005681132957, "rougeL_precision_stderr": 0.0019138188823571657, "rougeL_recall": 0.341537961684278, "rougeL_recall_stderr": 0.0023341037495063265, "rougeLsum_fmeasure": 0.35595547534863364, "rougeLsum_fmeasure_stderr": 0.0019811017835625216, "rougeLsum_precision": 0.3448175191058933, "rougeLsum_precision_stderr": 0.0022377191876361987, "rougeLsum_recall": 0.3990338396086915, "rougeLsum_recall_stderr": 0.00266713283320383}}, "4": {"generate_text_restaurant": {"bleu": 10.433235636684694, "bleu_stderr": 0.16559538387279382, "rouge1_fmeasure": 0.4276906087969391, "rouge1_fmeasure_stderr": 0.0019888873745294716, "rouge1_precision": 0.41299825090051584, "rouge1_precision_stderr": 0.0023368767157547292, "rouge1_recall": 0.47962543535821944, "rouge1_recall_stderr": 0.002770194421824991, "rouge2_fmeasure": 0.19376219920647275, "rouge2_fmeasure_stderr": 0.0017880215942833856, "rouge2_precision": 0.18652386544213564, "rouge2_precision_stderr": 0.001859671426693175, "rouge2_recall": 
0.22057943475581543, "rouge2_recall_stderr": 0.002268269935834746, "rougeL_fmeasure": 0.30693273428518913, "rougeL_fmeasure_stderr": 0.0017093495953663948, "rougeL_precision": 0.29632873268599635, "rougeL_precision_stderr": 0.0019349036829606914, "rougeL_recall": 0.3453892484465583, "rougeL_recall_stderr": 0.0023533437638236336, "rougeLsum_fmeasure": 0.3575618775034696, "rougeLsum_fmeasure_stderr": 0.0020284523859768924, "rougeLsum_precision": 0.3452141125875863, "rougeLsum_precision_stderr": 0.0022633006759387865, "rougeLsum_recall": 0.4013412942257543, "rougeLsum_recall_stderr": 0.002691394699368152}}, "5": {"generate_text_restaurant": {"bleu": 10.524222254347402, "bleu_stderr": 0.1570733014354235, "rouge1_fmeasure": 0.4297336508488616, "rouge1_fmeasure_stderr": 0.0019877776713711744, "rouge1_precision": 0.41338865705266253, "rouge1_precision_stderr": 0.0023151584252829207, "rouge1_recall": 0.48229437905400363, "rouge1_recall_stderr": 0.002780755795312751, "rouge2_fmeasure": 0.19587324037834244, "rouge2_fmeasure_stderr": 0.0017783986404322938, "rouge2_precision": 0.18750452947717144, "rouge2_precision_stderr": 0.0017854738914506623, "rouge2_recall": 0.22300380292777322, "rouge2_recall_stderr": 0.002292337991373791, "rougeL_fmeasure": 0.3112157685715752, "rougeL_fmeasure_stderr": 0.0017222919884681302, "rougeL_precision": 0.29901837534732933, "rougeL_precision_stderr": 0.0018933521587321512, "rougeL_recall": 0.3505715839455596, "rougeL_recall_stderr": 0.0023947239138506207, "rougeLsum_fmeasure": 0.3605533829938833, "rougeLsum_fmeasure_stderr": 0.0019926332071944987, "rougeLsum_precision": 0.346626530434538, "rougeLsum_precision_stderr": 0.0021961576508456597, "rougeLsum_recall": 0.40525157315277366, "rougeLsum_recall_stderr": 0.002694650983597002}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.6443865340332708, "bleu_stderr": 0.06448573771008022, "rouge1_fmeasure": 0.20894728461895484, "rouge1_fmeasure_stderr": 0.002676706164139504, "rouge1_precision": 0.15977268662525132, "rouge1_precision_stderr": 0.002498597687577891, "rouge1_recall": 0.3439802391671489, "rouge1_recall_stderr": 0.00444649707695747, "rouge2_fmeasure": 0.04399744295623738, "rouge2_fmeasure_stderr": 0.0015740981350718616, "rouge2_precision": 0.03339906984759204, "rouge2_precision_stderr": 0.001300434810680209, "rouge2_recall": 0.07442330924142955, "rouge2_recall_stderr": 0.002682978761295735, "rougeL_fmeasure": 0.1503650560043469, "rougeL_fmeasure_stderr": 0.0019556473810402567, "rougeL_precision": 0.11534576817816335, "rougeL_precision_stderr": 0.0018893138267035729, "rougeL_recall": 0.24796063756999268, "rougeL_recall_stderr": 0.003283686560553736, "rougeLsum_fmeasure": 0.1619728615486753, "rougeLsum_fmeasure_stderr": 0.0021854057188140374, "rougeLsum_precision": 0.12356200850631334, "rougeLsum_precision_stderr": 0.001992934285663947, "rougeLsum_recall": 0.26845293758700167, "rougeLsum_recall_stderr": 0.0037961745448933685}}, "1": {"article_DOC_summary": {"bleu": 1.179463416983736, "bleu_stderr": 0.049322464873007926, "rouge1_fmeasure": 0.17789279128849939, "rouge1_fmeasure_stderr": 0.002409867096248393, "rouge1_precision": 0.1283487315327207, "rouge1_precision_stderr": 0.0018698659023036683, "rouge1_recall": 0.30677115841434355, "rouge1_recall_stderr": 0.004142945172247196, "rouge2_fmeasure": 0.03148436075040743, "rouge2_fmeasure_stderr": 0.0012999151791325903, "rouge2_precision": 0.02229153527620068, "rouge2_precision_stderr": 0.0009290255555238756, "rouge2_recall": 0.05667515961237042, 
"rouge2_recall_stderr": 0.0024282370591926813, "rougeL_fmeasure": 0.13571854568454508, "rougeL_fmeasure_stderr": 0.0017897390119063444, "rougeL_precision": 0.09766150074500464, "rougeL_precision_stderr": 0.0013689556817164002, "rougeL_recall": 0.2359307623025628, "rougeL_recall_stderr": 0.0032602221353705487, "rougeLsum_fmeasure": 0.1391691564025729, "rougeLsum_fmeasure_stderr": 0.0019802562047267817, "rougeLsum_precision": 0.10010552349835154, "rougeLsum_precision_stderr": 0.00150075160578187, "rougeLsum_recall": 0.24214462905923226, "rougeLsum_recall_stderr": 0.00358086419270699}}, "2": {"article_DOC_summary": {"bleu": 1.171605281185158, "bleu_stderr": 0.09705805598592233, "rouge1_fmeasure": 0.16607212501588745, "rouge1_fmeasure_stderr": 0.002465797526765075, "rouge1_precision": 0.12026259340246569, "rouge1_precision_stderr": 0.001959615202442428, "rouge1_recall": 0.28539768882214345, "rouge1_recall_stderr": 0.0042054437997353904, "rouge2_fmeasure": 0.031093372281268725, "rouge2_fmeasure_stderr": 0.001315103452152175, "rouge2_precision": 0.022171538116183456, "rouge2_precision_stderr": 0.0009481516225085209, "rouge2_recall": 0.05565794662443053, "rouge2_recall_stderr": 0.0024464114450631824, "rougeL_fmeasure": 0.13412112184120395, "rougeL_fmeasure_stderr": 0.001883651866644435, "rougeL_precision": 0.09678284723300769, "rougeL_precision_stderr": 0.0014596259783767793, "rougeL_recall": 0.2322804210017996, "rougeL_recall_stderr": 0.0034048283128554914, "rougeLsum_fmeasure": 0.12819178525497177, "rougeLsum_fmeasure_stderr": 0.001989564938897622, "rougeLsum_precision": 0.09254890915149624, "rougeLsum_precision_stderr": 0.0015482197742084348, "rougeLsum_recall": 0.2221642225599103, "rougeLsum_recall_stderr": 0.0035391189699574043}}, "3": {"article_DOC_summary": {"bleu": 1.2421369448331907, "bleu_stderr": 0.09404310747626213, "rouge1_fmeasure": 0.16125775260746725, "rouge1_fmeasure_stderr": 0.0024929400149637957, "rouge1_precision": 0.11937762777694022, "rouge1_precision_stderr": 0.0020785359403546226, "rouge1_recall": 0.27229911353559627, "rouge1_recall_stderr": 0.004290006567245677, "rouge2_fmeasure": 0.03135863836588418, "rouge2_fmeasure_stderr": 0.0012490119416798898, "rouge2_precision": 0.022594629033534534, "rouge2_precision_stderr": 0.0009043350501605985, "rouge2_recall": 0.05460155665840838, "rouge2_recall_stderr": 0.0022724335465863234, "rougeL_fmeasure": 0.13340144911682686, "rougeL_fmeasure_stderr": 0.001985882478626079, "rougeL_precision": 0.09874208631971298, "rougeL_precision_stderr": 0.001712176793787427, "rougeL_recall": 0.2263072791819812, "rougeL_recall_stderr": 0.0035183479716809104, "rougeLsum_fmeasure": 0.1244865414349181, "rougeLsum_fmeasure_stderr": 0.0020109053608498664, "rougeLsum_precision": 0.09228468276563344, "rougeLsum_precision_stderr": 0.0017353557651595074, "rougeLsum_recall": 0.2115269820097042, "rougeLsum_recall_stderr": 0.003570966623025798}}, "4": {"article_DOC_summary": {"bleu": 0.7507897873071828, "bleu_stderr": 0.09022391990160196, "rouge1_fmeasure": 0.04587050315380673, "rouge1_fmeasure_stderr": 0.002682921061867743, "rouge1_precision": 0.0396817017668287, "rouge1_precision_stderr": 0.0025276425448518785, "rouge1_recall": 0.07000574538399217, "rouge1_recall_stderr": 0.0041107759713876766, "rouge2_fmeasure": 0.009627528506364348, "rouge2_fmeasure_stderr": 0.0009541396465140421, "rouge2_precision": 0.007710704081296013, "rouge2_precision_stderr": 0.0008327806241196672, "rouge2_recall": 0.015032913904949322, "rouge2_recall_stderr": 0.0014695428313471306, 
"rougeL_fmeasure": 0.03768209350615277, "rougeL_fmeasure_stderr": 0.002167464042953999, "rougeL_precision": 0.03284394634659473, "rougeL_precision_stderr": 0.002105616405808581, "rougeL_recall": 0.05787260392319605, "rougeL_recall_stderr": 0.0033756839248268904, "rougeLsum_fmeasure": 0.03690509099375681, "rougeLsum_fmeasure_stderr": 0.002174042106201563, "rougeLsum_precision": 0.032354994329316655, "rougeLsum_precision_stderr": 0.002122514792763723, "rougeLsum_recall": 0.05653870801404548, "rougeLsum_recall_stderr": 0.0033699559531707246}}, "5": {"article_DOC_summary": {"bleu": 8.215991966026401e-36, "bleu_stderr": 2.035943344842957e-31, "rouge1_fmeasure": 0.0025391991198481386, "rouge1_fmeasure_stderr": 0.0007178274187167163, "rouge1_precision": 0.0029852282009972296, "rouge1_precision_stderr": 0.000878002695182032, "rouge1_recall": 0.002256034533050118, "rouge1_recall_stderr": 0.0006291747215958679, "rouge2_fmeasure": 0.0004511007812344588, "rouge2_fmeasure_stderr": 0.00018994877121643913, "rouge2_precision": 0.0005539957329635465, "rouge2_precision_stderr": 0.0002384720401159147, "rouge2_recall": 0.00039131204225543846, "rouge2_recall_stderr": 0.00016680843472868898, "rougeL_fmeasure": 0.0019230606581652292, "rougeL_fmeasure_stderr": 0.0005609337748918923, "rougeL_precision": 0.00225272205418534, "rougeL_precision_stderr": 0.0006884647595916458, "rougeL_recall": 0.001717413213020426, "rougeL_recall_stderr": 0.0004941575994563838, "rougeLsum_fmeasure": 0.0020240460078641293, "rougeLsum_fmeasure_stderr": 0.0005911504360672772, "rougeLsum_precision": 0.0023567731085693583, "rougeLsum_precision_stderr": 0.0007147557896729491, "rougeLsum_recall": 0.0018155084831467728, "rougeLsum_recall_stderr": 0.0005263836320326491}}}} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_0.csv b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..ec36b2b6939116da9887bc1b13e1143669fd19ec --- /dev/null +++ b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.314,0.014683991951087966,0 +anli_r2,acc,0.326,0.014830507204541035,0 +anli_r3,acc,0.355,0.013819249004047296,0 +arc_challenge,acc,0.2090443686006826,0.01188274698740645,0 +arc_challenge,acc_norm,0.25170648464163825,0.012682496334042968,0 +arc_easy,acc,0.39057239057239057,0.010011059112064236,0 +arc_easy,acc_norm,0.36658249158249157,0.009887786585323946,0 +boolq,acc,0.5571865443425077,0.008687668766930832,1 +cb,acc,0.32142857142857145,0.06297362289056341,1 +cb,f1,0.28889599317988063,,1 +copa,acc,0.73,0.044619604333847394,0 +hellaswag,acc,0.4563831905994822,0.004970759774676886,0 +hellaswag,acc_norm,0.5928101971718781,0.004903066639761947,0 +piqa,acc,0.6322089227421109,0.011250616646678795,0 +piqa,acc_norm,0.6311207834602829,0.011257546676908809,0 +rte,acc,0.5956678700361011,0.029540420517619716,0 +sciq,acc,0.703,0.014456832294801098,0 +sciq,acc_norm,0.647,0.015120172605483697,0 +storycloze_2016,acc,0.6520577231427044,0.011014779784784828,0 +winogrande,acc,0.5619573796369376,0.013944181296470804,0 diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json deleted file mode 100644 index 336251aee10ad26e1a61abc7ff3170e89e131de3..0000000000000000000000000000000000000000 --- 
a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.314, - "acc_stderr": 0.014683991951087966 - }, - "anli_r2": { - "acc": 0.326, - "acc_stderr": 0.014830507204541035 - }, - "anli_r3": { - "acc": 0.355, - "acc_stderr": 0.013819249004047296 - }, - "cb": { - "acc": 0.32142857142857145, - "acc_stderr": 0.06297362289056341, - "f1": 0.28889599317988063 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.044619604333847394 - }, - "hellaswag": { - "acc": 0.4563831905994822, - "acc_stderr": 0.004970759774676886, - "acc_norm": 0.5928101971718781, - "acc_norm_stderr": 0.004903066639761947 - }, - "rte": { - "acc": 0.5956678700361011, - "acc_stderr": 0.029540420517619716 - }, - "winogrande": { - "acc": 0.5619573796369376, - "acc_stderr": 0.013944181296470804 - }, - "storycloze_2016": { - "acc": 0.6520577231427044, - "acc_stderr": 0.011014779784784828 - }, - "boolq": { - "acc": 0.5571865443425077, - "acc_stderr": 0.008687668766930832 - }, - "arc_easy": { - "acc": 0.39057239057239057, - "acc_stderr": 0.010011059112064236, - "acc_norm": 0.36658249158249157, - "acc_norm_stderr": 0.009887786585323946 - }, - "arc_challenge": { - "acc": 0.2090443686006826, - "acc_stderr": 0.01188274698740645, - "acc_norm": 0.25170648464163825, - "acc_norm_stderr": 0.012682496334042968 - }, - "sciq": { - "acc": 0.703, - "acc_stderr": 0.014456832294801098, - "acc_norm": 0.647, - "acc_norm_stderr": 0.015120172605483697 - }, - "piqa": { - "acc": 0.6322089227421109, - "acc_stderr": 0.011250616646678795, - "acc_norm": 0.6311207834602829, - "acc_norm_stderr": 0.011257546676908809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_1.csv b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..ee1f84ceab64b364865c77eb9484802ae0c27013 --- /dev/null +++ b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.304,0.014553205687950434,0 +anli_r2,acc,0.332,0.014899597242811482,0 +anli_r3,acc,0.34833333333333333,0.013759437498874061,0 +arc_challenge,acc,0.2508532423208191,0.01266819862131543,0 +arc_challenge,acc_norm,0.2764505119453925,0.013069662474252425,0 +arc_easy,acc,0.5096801346801347,0.010257860554461122,0 +arc_easy,acc_norm,0.46296296296296297,0.010231597249131062,0 +boolq,acc,0.6155963302752293,0.008508133844703919,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.30465949820788535,,1 +copa,acc,0.77,0.042295258468165065,0 +hellaswag,acc,0.45429197371041624,0.004968888130290068,0 +hellaswag,acc_norm,0.5927106154152559,0.004903254264177628,0 +piqa,acc,0.6953210010881393,0.010738889044325161,0 +piqa,acc_norm,0.6953210010881393,0.010738889044325161,0 +rte,acc,0.5595667870036101,0.02988212336311872,0 +sciq,acc,0.827,0.011967214137559941,0 +sciq,acc_norm,0.789,0.01290913032104209,0 +storycloze_2016,acc,0.6734366648850882,0.010844543793668893,0 +winogrande,acc,0.5603788476716653,0.013949649776015696,0 diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json 
b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json deleted file mode 100644 index 8f87bff9ba716a7775f31622271e588b9258c713..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.304, - "acc_stderr": 0.014553205687950434 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811482 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874061 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.30465949820788535 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.042295258468165065 - }, - "hellaswag": { - "acc": 0.45429197371041624, - "acc_stderr": 0.004968888130290068, - "acc_norm": 0.5927106154152559, - "acc_norm_stderr": 0.004903254264177628 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.02988212336311872 - }, - "winogrande": { - "acc": 0.5603788476716653, - "acc_stderr": 0.013949649776015696 - }, - "storycloze_2016": { - "acc": 0.6734366648850882, - "acc_stderr": 0.010844543793668893 - }, - "boolq": { - "acc": 0.6155963302752293, - "acc_stderr": 0.008508133844703919 - }, - "arc_easy": { - "acc": 0.5096801346801347, - "acc_stderr": 0.010257860554461122, - "acc_norm": 0.46296296296296297, - "acc_norm_stderr": 0.010231597249131062 - }, - "arc_challenge": { - "acc": 0.2508532423208191, - "acc_stderr": 0.01266819862131543, - "acc_norm": 0.2764505119453925, - "acc_norm_stderr": 0.013069662474252425 - }, - "sciq": { - "acc": 0.827, - "acc_stderr": 0.011967214137559941, - "acc_norm": 0.789, - "acc_norm_stderr": 0.01290913032104209 - }, - "piqa": { - "acc": 0.6953210010881393, - "acc_stderr": 0.010738889044325161, - "acc_norm": 0.6953210010881393, - "acc_norm_stderr": 0.010738889044325161 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_2.csv b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..47c2f91a76084ffa704af472042747beb257d14d --- /dev/null +++ b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.311,0.014645596385722695,0 +anli_r2,acc,0.313,0.014671272822977886,0 +anli_r3,acc,0.33166666666666667,0.013596836729485156,0 +arc_challenge,acc,0.2568259385665529,0.0127669237941168,0 +arc_challenge,acc_norm,0.30119453924914674,0.01340674176784762,0 +arc_easy,acc,0.5555555555555556,0.01019625483869168,0 +arc_easy,acc_norm,0.5366161616161617,0.01023223506393303,0 +boolq,acc,0.6061162079510704,0.008545835792614982,1 +cb,acc,0.3392857142857143,0.06384226561930828,1 +cb,f1,0.23827865281885505,,1 +copa,acc,0.8,0.04020151261036845,0 +hellaswag,acc,0.46036646086436966,0.0049740806383642665,0 +hellaswag,acc_norm,0.6048595897231627,0.00487881696101204,0 +piqa,acc,0.719804134929271,0.010478122015577082,0 +piqa,acc_norm,0.7181719260065288,0.010496675231258159,0 +rte,acc,0.4981949458483754,0.030096267148976633,0 +sciq,acc,0.852,0.011234866364235239,0 +sciq,acc_norm,0.834,0.011772110370812185,0 
+storycloze_2016,acc,0.6755745590593266,0.01082613134499089,0 +winogrande,acc,0.5580110497237569,0.013957584079109001,0 diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json deleted file mode 100644 index 24cac45800d9dad2e42d638a0c50252bdd1b4dae..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.311, - "acc_stderr": 0.014645596385722695 - }, - "anli_r2": { - "acc": 0.313, - "acc_stderr": 0.014671272822977886 - }, - "anli_r3": { - "acc": 0.33166666666666667, - "acc_stderr": 0.013596836729485156 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930828, - "f1": 0.23827865281885505 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036845 - }, - "hellaswag": { - "acc": 0.46036646086436966, - "acc_stderr": 0.0049740806383642665, - "acc_norm": 0.6048595897231627, - "acc_norm_stderr": 0.00487881696101204 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976633 - }, - "winogrande": { - "acc": 0.5580110497237569, - "acc_stderr": 0.013957584079109001 - }, - "storycloze_2016": { - "acc": 0.6755745590593266, - "acc_stderr": 0.01082613134499089 - }, - "boolq": { - "acc": 0.6061162079510704, - "acc_stderr": 0.008545835792614982 - }, - "arc_easy": { - "acc": 0.5555555555555556, - "acc_stderr": 0.01019625483869168, - "acc_norm": 0.5366161616161617, - "acc_norm_stderr": 0.01023223506393303 - }, - "arc_challenge": { - "acc": 0.2568259385665529, - "acc_stderr": 0.0127669237941168, - "acc_norm": 0.30119453924914674, - "acc_norm_stderr": 0.01340674176784762 - }, - "sciq": { - "acc": 0.852, - "acc_stderr": 0.011234866364235239, - "acc_norm": 0.834, - "acc_norm_stderr": 0.011772110370812185 - }, - "piqa": { - "acc": 0.719804134929271, - "acc_stderr": 0.010478122015577082, - "acc_norm": 0.7181719260065288, - "acc_norm_stderr": 0.010496675231258159 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_3.csv b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..ca0f9e8a8abd847da10f3ae27a422949ad89640b --- /dev/null +++ b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.309,0.01461960097720649,0 +anli_r2,acc,0.336,0.014944140233795018,0 +anli_r3,acc,0.34833333333333333,0.013759437498874075,0 +arc_challenge,acc,0.2815699658703072,0.013143376735009022,0 +arc_challenge,acc_norm,0.3054607508532423,0.013460080478002498,0 +arc_easy,acc,0.5753367003367004,0.010142653687480416,0 +arc_easy,acc_norm,0.5513468013468014,0.010205540414612871,0 +boolq,acc,0.617737003058104,0.008499149690449273,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.3456203829338158,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.46036646086436966,0.004974080638364265,0 +hellaswag,acc_norm,0.6097390957976498,0.004868117598481941,0 
+piqa,acc,0.7377584330794341,0.01026250256517245,0 +piqa,acc_norm,0.7404787812840044,0.010227939888173923,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.848,0.01135891830347528,0 +sciq,acc_norm,0.845,0.011450157470799475,0 +storycloze_2016,acc,0.692143238909674,0.010674598158758186,0 +winogrande,acc,0.5627466456195738,0.013941393310695924,0 diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json deleted file mode 100644 index 09970f7ccdfd2198092cb89473e193458ab74faf..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.309, - "acc_stderr": 0.01461960097720649 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795018 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874075 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.3456203829338158 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.46036646086436966, - "acc_stderr": 0.004974080638364265, - "acc_norm": 0.6097390957976498, - "acc_norm_stderr": 0.004868117598481941 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5627466456195738, - "acc_stderr": 0.013941393310695924 - }, - "storycloze_2016": { - "acc": 0.692143238909674, - "acc_stderr": 0.010674598158758186 - }, - "boolq": { - "acc": 0.617737003058104, - "acc_stderr": 0.008499149690449273 - }, - "arc_easy": { - "acc": 0.5753367003367004, - "acc_stderr": 0.010142653687480416, - "acc_norm": 0.5513468013468014, - "acc_norm_stderr": 0.010205540414612871 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.013143376735009022, - "acc_norm": 0.3054607508532423, - "acc_norm_stderr": 0.013460080478002498 - }, - "sciq": { - "acc": 0.848, - "acc_stderr": 0.01135891830347528, - "acc_norm": 0.845, - "acc_norm_stderr": 0.011450157470799475 - }, - "piqa": { - "acc": 0.7377584330794341, - "acc_stderr": 0.01026250256517245, - "acc_norm": 0.7404787812840044, - "acc_norm_stderr": 0.010227939888173923 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_4.csv b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..786a1f0f4e00fd4538c07bbffaef711b76695cef --- /dev/null +++ b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.338,0.014965960710224473,0 +anli_r2,acc,0.328,0.014853842487270333,0 +anli_r3,acc,0.3425,0.013704669762934728,0 +arc_challenge,acc,0.2815699658703072,0.01314337673500901,0 +arc_challenge,acc_norm,0.3242320819112628,0.01367881039951882,0 +arc_easy,acc,0.5984848484848485,0.010058790020755572,0 +arc_easy,acc_norm,0.571969696969697,0.01015294331642626,0 +boolq,acc,0.6100917431192661,0.00853043797286262,1 
+cb,acc,0.4642857142857143,0.0672477765493766,1 +cb,f1,0.26271604938271603,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.46863174666401114,0.0049799521665955405,0 +hellaswag,acc_norm,0.6181039633539136,0.004848583243606704,0 +piqa,acc,0.7393906420021763,0.010241826155811627,0 +piqa,acc_norm,0.7383025027203483,0.010255630772708229,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.864,0.010845350230472995,0 +sciq,acc_norm,0.862,0.01091215263250441,0 +storycloze_2016,acc,0.6958845537145911,0.010638172655194789,0 +winogrande,acc,0.5643251775848461,0.013935709739615715,0 diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json deleted file mode 100644 index 09a3f0a6baaa07a229486aff64ab81b8ccd42e18..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.014965960710224473 - }, - "anli_r2": { - "acc": 0.328, - "acc_stderr": 0.014853842487270333 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934728 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.0672477765493766, - "f1": 0.26271604938271603 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.040201512610368445 - }, - "hellaswag": { - "acc": 0.46863174666401114, - "acc_stderr": 0.0049799521665955405, - "acc_norm": 0.6181039633539136, - "acc_norm_stderr": 0.004848583243606704 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5643251775848461, - "acc_stderr": 0.013935709739615715 - }, - "storycloze_2016": { - "acc": 0.6958845537145911, - "acc_stderr": 0.010638172655194789 - }, - "boolq": { - "acc": 0.6100917431192661, - "acc_stderr": 0.00853043797286262 - }, - "arc_easy": { - "acc": 0.5984848484848485, - "acc_stderr": 0.010058790020755572, - "acc_norm": 0.571969696969697, - "acc_norm_stderr": 0.01015294331642626 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.01314337673500901, - "acc_norm": 0.3242320819112628, - "acc_norm_stderr": 0.01367881039951882 - }, - "sciq": { - "acc": 0.864, - "acc_stderr": 0.010845350230472995, - "acc_norm": 0.862, - "acc_norm_stderr": 0.01091215263250441 - }, - "piqa": { - "acc": 0.7393906420021763, - "acc_stderr": 0.010241826155811627, - "acc_norm": 0.7383025027203483, - "acc_norm_stderr": 0.010255630772708229 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_5.csv b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..46657ca50cab453748561a8f758e1fe2c83e65ab --- /dev/null +++ b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.01483050720454105,0 +anli_r2,acc,0.317,0.014721675438880226,0 +anli_r3,acc,0.33416666666666667,0.013622434813136783,0 +arc_challenge,acc,0.2815699658703072,0.013143376735009031,0 
+arc_challenge,acc_norm,0.3199658703071672,0.013631345807016191,0 +arc_easy,acc,0.6035353535353535,0.010037412763064526,0 +arc_easy,acc_norm,0.5782828282828283,0.010133255284012327,0 +boolq,acc,0.6162079510703364,0.008505584729104967,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.24002574002573998,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.468034256124278,0.0049795737655758555,0 +hellaswag,acc_norm,0.6201951802429795,0.004843462545943488,0 +piqa,acc,0.73449401523395,0.010303308653024429,0 +piqa,acc_norm,0.7459194776931447,0.01015727199913505,0 +rte,acc,0.5379061371841155,0.030009848912529113,0 +sciq,acc,0.875,0.010463483381956722,0 +sciq,acc_norm,0.861,0.010945263761042968,0 +storycloze_2016,acc,0.703901656867985,0.010557307688475116,0 +winogrande,acc,0.5824782951854776,0.013859978264440253,0 diff --git a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json b/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json deleted file mode 100644 index 6383308da1c94c520916c8923e55170599f11915..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed3/evaluation/rankeval/4b284b28bc4seed3_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.01483050720454105 - }, - "anli_r2": { - "acc": 0.317, - "acc_stderr": 0.014721675438880226 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.013622434813136783 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.24002574002573998 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.468034256124278, - "acc_stderr": 0.0049795737655758555, - "acc_norm": 0.6201951802429795, - "acc_norm_stderr": 0.004843462545943488 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529113 - }, - "winogrande": { - "acc": 0.5824782951854776, - "acc_stderr": 0.013859978264440253 - }, - "storycloze_2016": { - "acc": 0.703901656867985, - "acc_stderr": 0.010557307688475116 - }, - "boolq": { - "acc": 0.6162079510703364, - "acc_stderr": 0.008505584729104967 - }, - "arc_easy": { - "acc": 0.6035353535353535, - "acc_stderr": 0.010037412763064526, - "acc_norm": 0.5782828282828283, - "acc_norm_stderr": 0.010133255284012327 - }, - "arc_challenge": { - "acc": 0.2815699658703072, - "acc_stderr": 0.013143376735009031, - "acc_norm": 0.3199658703071672, - "acc_norm_stderr": 0.013631345807016191 - }, - "sciq": { - "acc": 0.875, - "acc_stderr": 0.010463483381956722, - "acc_norm": 0.861, - "acc_norm_stderr": 0.010945263761042968 - }, - "piqa": { - "acc": 0.73449401523395, - "acc_stderr": 0.010303308653024429, - "acc_norm": 0.7459194776931447, - "acc_norm_stderr": 0.01015727199913505 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/generation/merged.csv b/4b284b28bc4seed4/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..ffeb2319ff79f4722c62b0f990ab482e31d054c9 --- /dev/null +++ b/4b284b28bc4seed4/evaluation/generation/merged.csv @@ -0,0 +1,27 @@ +dataset,fewshots,prompt,metric,value 
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.011764220975410813 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.011764220975410813 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.19063915520224362 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.19063915520224362 +e2e_nlg_cleaned,1,average,multiple,0.10120168808882721 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04480106981761703 +gem_xsum,0,median,rouge2_fmeasure,0.04480106981761703 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.033207183560968925 +gem_xsum,1,median,rouge2_fmeasure,0.033207183560968925 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.036694425298060884 +gem_xsum,2,median,rouge2_fmeasure,0.036694425298060884 +gem_xsum,2,average,multiple,0.038234226225548944 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.048680633498613235 +web_nlg_en,0,median,rouge2_fmeasure,0.048680633498613235 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.053026565677639095 +web_nlg_en,1,median,rouge2_fmeasure,0.053026565677639095 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05071036081345163 +web_nlg_en,2,median,rouge2_fmeasure,0.05071036081345163 +web_nlg_en,2,average,multiple,0.05080585332990132 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03548853418089557 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03548853418089557 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05180034750467234 +wiki_lingua_en,1,median,rouge2_fmeasure,0.05180034750467234 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05616050870223023 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05616050870223023 +wiki_lingua_en,2,average,multiple,0.04781646346259938 diff --git a/4b284b28bc4seed4/evaluation/generation/merged.json b/4b284b28bc4seed4/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..90f745d8685c0041c77a6a4b3f36e83206424cf6 --- /dev/null +++ b/4b284b28bc4seed4/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.31620109957758985, "bleu_stderr": 0.0436977639970438, "rouge1_fmeasure": 0.10315902016726786, "rouge1_fmeasure_stderr": 0.0019060070438482958, "rouge1_precision": 0.06769961049671226, "rouge1_precision_stderr": 0.0014408731719188182, "rouge1_recall": 0.2867407372924206, "rouge1_recall_stderr": 0.004571157006568459, "rouge2_fmeasure": 0.048680633498613235, "rouge2_fmeasure_stderr": 0.0012001070226091816, "rouge2_precision": 0.031623473816054536, "rouge2_precision_stderr": 0.0008563816673904925, "rouge2_recall": 0.14002477235273347, "rouge2_recall_stderr": 0.0030829927271468455, "rougeL_fmeasure": 0.10001471836261917, "rougeL_fmeasure_stderr": 0.0018101971022073298, "rougeL_precision": 0.06551735699492812, "rougeL_precision_stderr": 0.001360472354290908, "rougeL_recall": 0.27974078406805947, "rougeL_recall_stderr": 0.004483148560424672, "rougeLsum_fmeasure": 0.09908745681833024, "rougeLsum_fmeasure_stderr": 0.001816434962081851, "rougeLsum_precision": 0.06504923908690437, "rougeLsum_precision_stderr": 0.0013793228194582826, "rougeLsum_recall": 0.2754429077514643, "rougeLsum_recall_stderr": 0.00433299850418606}}, "1": {"PALM_prompt": {"bleu": 0.4465798489958593, "bleu_stderr": 0.03821208721897477, "rouge1_fmeasure": 0.11510221349692805, "rouge1_fmeasure_stderr": 0.0018417715919236185, "rouge1_precision": 0.07371823857673165, "rouge1_precision_stderr": 0.0013763349225185478, "rouge1_recall": 0.3688809731011637, "rouge1_recall_stderr": 0.005115562578388885, "rouge2_fmeasure": 0.053026565677639095, "rouge2_fmeasure_stderr": 0.0011718066518224836, "rouge2_precision": 
0.03378602637117124, "rouge2_precision_stderr": 0.000831107209277974, "rouge2_recall": 0.17620010741543168, "rouge2_recall_stderr": 0.0034569647317057232, "rougeL_fmeasure": 0.10846810658720578, "rougeL_fmeasure_stderr": 0.0016257087585928257, "rougeL_precision": 0.06938216236815531, "rougeL_precision_stderr": 0.0012090035216715325, "rougeL_recall": 0.34811809237457786, "rougeL_recall_stderr": 0.004687691743161809, "rougeLsum_fmeasure": 0.11005414089372284, "rougeLsum_fmeasure_stderr": 0.0017281404618979693, "rougeLsum_precision": 0.07050615181309323, "rougeLsum_precision_stderr": 0.0012933152319356035, "rougeLsum_recall": 0.3514069079411779, "rougeLsum_recall_stderr": 0.004710020922149002}}, "2": {"PALM_prompt": {"bleu": 0.49390694019851883, "bleu_stderr": 0.03283528773113811, "rouge1_fmeasure": 0.11167166378709677, "rouge1_fmeasure_stderr": 0.0016694573474676788, "rouge1_precision": 0.07053978792457792, "rouge1_precision_stderr": 0.0012141148921630063, "rouge1_recall": 0.3722085841928087, "rouge1_recall_stderr": 0.004932224410126405, "rouge2_fmeasure": 0.05071036081345163, "rouge2_fmeasure_stderr": 0.0010731951740562927, "rouge2_precision": 0.031882639973859254, "rouge2_precision_stderr": 0.0007440437082987532, "rouge2_recall": 0.179162100861545, "rouge2_recall_stderr": 0.0036067927027667616, "rougeL_fmeasure": 0.10482960375745125, "rougeL_fmeasure_stderr": 0.0015367324295981716, "rougeL_precision": 0.0662695912572972, "rougeL_precision_stderr": 0.001111137879131896, "rougeL_recall": 0.34634376427492525, "rougeL_recall_stderr": 0.004426222539040666, "rougeLsum_fmeasure": 0.10651850585213189, "rougeLsum_fmeasure_stderr": 0.0015844678024154568, "rougeLsum_precision": 0.06731544242555773, "rougeLsum_precision_stderr": 0.0011479427958618216, "rougeLsum_recall": 0.35393011607524494, "rougeLsum_recall_stderr": 0.004611169262021209}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.60885283648322, "bleu_stderr": 0.046404803775343256, "rouge1_fmeasure": 0.16981770764951246, "rouge1_fmeasure_stderr": 0.0018844380426673607, "rouge1_precision": 0.146341713551491, "rouge1_precision_stderr": 0.001907488992714226, "rouge1_recall": 0.24303710279927596, "rouge1_recall_stderr": 0.0026740629162863815, "rouge2_fmeasure": 0.03548853418089557, "rouge2_fmeasure_stderr": 0.0008596537307285957, "rouge2_precision": 0.030255907844947725, "rouge2_precision_stderr": 0.0007661823292966557, "rouge2_recall": 0.0524686714748099, "rouge2_recall_stderr": 0.0013880458970171093, "rougeL_fmeasure": 0.1358656284421895, "rougeL_fmeasure_stderr": 0.001389903118786626, "rougeL_precision": 0.11563887486905755, "rougeL_precision_stderr": 0.001369806180194911, "rougeL_recall": 0.1990169050957429, "rougeL_recall_stderr": 0.00219255306093255, "rougeLsum_fmeasure": 0.15524577102172657, "rougeLsum_fmeasure_stderr": 0.0017126012169208825, "rougeLsum_precision": 0.13357747346540397, "rougeLsum_precision_stderr": 0.0017299539524552423, "rougeLsum_recall": 0.22314380798385272, "rougeLsum_recall_stderr": 0.0024794181108329524}}, "1": {"tldr_en": {"bleu": 2.725412897374069, "bleu_stderr": 0.06590713364460175, "rouge1_fmeasure": 0.21273278313561203, "rouge1_fmeasure_stderr": 0.002011144383038581, "rouge1_precision": 0.19152350230841855, "rouge1_precision_stderr": 0.0023810619465519586, "rouge1_recall": 0.302694565096085, "rouge1_recall_stderr": 0.002871626633712586, "rouge2_fmeasure": 0.05180034750467234, "rouge2_fmeasure_stderr": 0.0010557187582866223, "rouge2_precision": 0.04768841739901736, "rouge2_precision_stderr": 
0.0012211060679656308, "rouge2_recall": 0.07569607558899676, "rouge2_recall_stderr": 0.0016977441994545117, "rougeL_fmeasure": 0.15200924118476186, "rougeL_fmeasure_stderr": 0.0013679053352126194, "rougeL_precision": 0.13653437043291017, "rougeL_precision_stderr": 0.001724532681214248, "rougeL_recall": 0.22191488232376044, "rougeL_recall_stderr": 0.0022628821878105465, "rougeLsum_fmeasure": 0.19969478957091424, "rougeLsum_fmeasure_stderr": 0.0018860691826304264, "rougeLsum_precision": 0.17974451166450459, "rougeLsum_precision_stderr": 0.0022478628561453545, "rougeLsum_recall": 0.28504093466511404, "rougeLsum_recall_stderr": 0.0027223245492742462}}, "2": {"tldr_en": {"bleu": 3.0660991212944317, "bleu_stderr": 0.10372713065464424, "rouge1_fmeasure": 0.21977815129794487, "rouge1_fmeasure_stderr": 0.0019325468923461176, "rouge1_precision": 0.21412338509678533, "rouge1_precision_stderr": 0.0028281010919592436, "rouge1_recall": 0.30461283300382674, "rouge1_recall_stderr": 0.002825271119088337, "rouge2_fmeasure": 0.05616050870223023, "rouge2_fmeasure_stderr": 0.0011252548440132944, "rouge2_precision": 0.05888063726692182, "rouge2_precision_stderr": 0.0017538775810198863, "rouge2_recall": 0.0782305781107223, "rouge2_recall_stderr": 0.001697698646970674, "rougeL_fmeasure": 0.15708445866755325, "rougeL_fmeasure_stderr": 0.0013652347675896105, "rougeL_precision": 0.15547333657821524, "rougeL_precision_stderr": 0.0023132587548469795, "rougeL_recall": 0.22142616128174153, "rougeL_recall_stderr": 0.0022228357872264405, "rougeLsum_fmeasure": 0.20703520109346182, "rougeLsum_fmeasure_stderr": 0.0018184636268796823, "rougeLsum_precision": 0.20218982558840237, "rougeLsum_precision_stderr": 0.002724146894044121, "rougeLsum_recall": 0.28724184959573407, "rougeLsum_recall_stderr": 0.002676027470194893}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.9486318659821757, "bleu_stderr": 0.046054186708753594, "rouge1_fmeasure": 0.06647296884394631, "rouge1_fmeasure_stderr": 0.0013703971424785793, "rouge1_precision": 0.05127435604947866, "rouge1_precision_stderr": 0.0010816160215933884, "rouge1_recall": 0.09809462257860511, "rouge1_recall_stderr": 0.001988448826829593, "rouge2_fmeasure": 0.011764220975410813, "rouge2_fmeasure_stderr": 0.0005194241761303413, "rouge2_precision": 0.009013651576884465, "rouge2_precision_stderr": 0.0003983827007173943, "rouge2_recall": 0.017566860336679598, "rouge2_recall_stderr": 0.0007876531254085971, "rougeL_fmeasure": 0.06585835855034873, "rougeL_fmeasure_stderr": 0.0013503477728497991, "rougeL_precision": 0.050780395408484696, "rougeL_precision_stderr": 0.001064268150921614, "rougeL_recall": 0.09725254219294728, "rougeL_recall_stderr": 0.001963943905207662, "rougeLsum_fmeasure": 0.05626251970787973, "rougeLsum_fmeasure_stderr": 0.0011217151771389764, "rougeLsum_precision": 0.04328842801917388, "rougeLsum_precision_stderr": 0.0008811589017402656, "rougeLsum_recall": 0.083491757226437, "rougeLsum_recall_stderr": 0.0016470232472348585}}, "1": {"generate_text_restaurant": {"bleu": 10.736417414826192, "bleu_stderr": 0.12883315690574576, "rouge1_fmeasure": 0.42347669756200107, "rouge1_fmeasure_stderr": 0.002299761985015768, "rouge1_precision": 0.4994281816248813, "rouge1_precision_stderr": 0.00315562260432105, "rouge1_recall": 0.4074682552684028, "rouge1_recall_stderr": 0.0028860632451414964, "rouge2_fmeasure": 0.19063915520224362, "rouge2_fmeasure_stderr": 0.001908484563894186, "rouge2_precision": 0.22810495987567186, "rouge2_precision_stderr": 0.0024574603360052174, 
"rouge2_recall": 0.1831102494700893, "rouge2_recall_stderr": 0.002053236226953995, "rougeL_fmeasure": 0.30934327771314435, "rougeL_fmeasure_stderr": 0.0019536593166029873, "rougeL_precision": 0.3668088539701383, "rougeL_precision_stderr": 0.0027243573366763025, "rougeL_recall": 0.29737795551223317, "rougeL_recall_stderr": 0.0023390292209083986, "rougeLsum_fmeasure": 0.34772596056303123, "rougeLsum_fmeasure_stderr": 0.0022228086994649674, "rougeLsum_precision": 0.41086699735293175, "rougeLsum_precision_stderr": 0.00298010231762013, "rougeLsum_recall": 0.33421135617493986, "rougeLsum_recall_stderr": 0.0026325643111599697}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.022928782196655, "bleu_stderr": 0.06323504041482224, "rouge1_fmeasure": 0.20850602933405124, "rouge1_fmeasure_stderr": 0.0025783509188228872, "rouge1_precision": 0.1690176511800054, "rouge1_precision_stderr": 0.0025057503766924764, "rouge1_recall": 0.3169600025152783, "rouge1_recall_stderr": 0.004390086610875057, "rouge2_fmeasure": 0.04480106981761703, "rouge2_fmeasure_stderr": 0.0016469393826685948, "rouge2_precision": 0.035231821829655605, "rouge2_precision_stderr": 0.0014122236080052043, "rouge2_recall": 0.0721669404414664, "rouge2_recall_stderr": 0.0026942676554950196, "rougeL_fmeasure": 0.15783763779709026, "rougeL_fmeasure_stderr": 0.0020045338016978164, "rougeL_precision": 0.1276621204119681, "rougeL_precision_stderr": 0.0019441096332612402, "rougeL_recall": 0.24175632866951158, "rougeL_recall_stderr": 0.0035366671887579804, "rougeLsum_fmeasure": 0.16234750057160516, "rougeLsum_fmeasure_stderr": 0.0021521921302323334, "rougeLsum_precision": 0.13096994127744102, "rougeLsum_precision_stderr": 0.002011329849282318, "rougeLsum_recall": 0.24916483206303525, "rougeLsum_recall_stderr": 0.003801039212088291}}, "1": {"article_DOC_summary": {"bleu": 1.3013783094677602, "bleu_stderr": 0.05323851494545823, "rouge1_fmeasure": 0.17053066710298548, "rouge1_fmeasure_stderr": 0.00248505468489638, "rouge1_precision": 0.1208560729467065, "rouge1_precision_stderr": 0.0018331156830519264, "rouge1_recall": 0.3011735743290485, "rouge1_recall_stderr": 0.004262574821709688, "rouge2_fmeasure": 0.033207183560968925, "rouge2_fmeasure_stderr": 0.001348828018973971, "rouge2_precision": 0.023321914774610174, "rouge2_precision_stderr": 0.0009490899905683155, "rouge2_recall": 0.06000231967162931, "rouge2_recall_stderr": 0.002490214577848948, "rougeL_fmeasure": 0.13262019002149206, "rougeL_fmeasure_stderr": 0.0018340910251430245, "rougeL_precision": 0.09381616328217902, "rougeL_precision_stderr": 0.0013434290825839483, "rougeL_recall": 0.23579210005809345, "rougeL_recall_stderr": 0.0033043055105988852, "rougeLsum_fmeasure": 0.13665130703802245, "rougeLsum_fmeasure_stderr": 0.0019843002642716883, "rougeLsum_precision": 0.09660547186408498, "rougeLsum_precision_stderr": 0.001445839504863143, "rougeLsum_recall": 0.24312964604533158, "rougeLsum_recall_stderr": 0.0035600019821752133}}, "2": {"article_DOC_summary": {"bleu": 1.4610227609434248, "bleu_stderr": 0.10327228234775739, "rouge1_fmeasure": 0.17640095609918918, "rouge1_fmeasure_stderr": 0.0024236476790657165, "rouge1_precision": 0.12496533702098848, "rouge1_precision_stderr": 0.0017902842566202737, "rouge1_recall": 0.31218014639465674, "rouge1_recall_stderr": 0.004202785523007602, "rouge2_fmeasure": 0.036694425298060884, "rouge2_fmeasure_stderr": 0.001436730910285075, "rouge2_precision": 0.02572420280473422, "rouge2_precision_stderr": 0.0010068612303705702, "rouge2_recall": 0.06686357551325348, 
"rouge2_recall_stderr": 0.002695792942377374, "rougeL_fmeasure": 0.13833597033877038, "rougeL_fmeasure_stderr": 0.0018696740518708518, "rougeL_precision": 0.09783615822934287, "rougeL_precision_stderr": 0.0013714730714619002, "rougeL_recall": 0.2462675293448225, "rougeL_recall_stderr": 0.0033637471659569394, "rougeLsum_fmeasure": 0.14067838491016335, "rougeLsum_fmeasure_stderr": 0.0020231190670618898, "rougeLsum_precision": 0.0994096762980347, "rougeLsum_precision_stderr": 0.001470776129912826, "rougeLsum_recall": 0.25080300299758923, "rougeLsum_recall_stderr": 0.0036598233956450744}}}} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_0.csv b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..38f8ff01e92fea40304aa14392959b99679d0350 --- /dev/null +++ b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095526,0 +anli_r2,acc,0.329,0.014865395385928367,0 +anli_r3,acc,0.3325,0.013605417345710528,0 +arc_challenge,acc,0.28668941979522183,0.01321498632927478,0 +arc_challenge,acc_norm,0.2986348122866894,0.01337407861506875,0 +arc_easy,acc,0.6039562289562289,0.010035580962097944,0 +arc_easy,acc_norm,0.5429292929292929,0.01022189756425604,0 +boolq,acc,0.591131498470948,0.008598573693259113,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.3156716417910448,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.476000796654053,0.00498403025050729,0 +hellaswag,acc_norm,0.6229834694284008,0.004836486437527274,0 +piqa,acc,0.749183895538629,0.010113869547069044,0 +piqa,acc_norm,0.7682263329706203,0.009845143772794046,0 +rte,acc,0.555956678700361,0.029907396333795987,0 +sciq,acc,0.857,0.01107581480856704,0 +sciq,acc_norm,0.764,0.013434451402438676,0 +storycloze_2016,acc,0.7172634954569749,0.01041380648612127,0 +winogrande,acc,0.5880031570639306,0.013833112857645938,0 diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-05_0shots_backup.json b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-05_0shots_backup.json deleted file mode 100644 index 6129d8090c05dd83c83215a3682e3bd3635f1d28..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-05_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928367 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.013605417345710528 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.3156716417910448 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.476000796654053, - "acc_stderr": 0.00498403025050729, - "acc_norm": 0.6229834694284008, - "acc_norm_stderr": 0.004836486437527274 - }, - "rte": { - "acc": 0.555956678700361, - "acc_stderr": 0.029907396333795987 - }, - "winogrande": { - "acc": 0.5880031570639306, - "acc_stderr": 0.013833112857645938 - }, - "storycloze_2016": { - "acc": 0.7172634954569749, - "acc_stderr": 0.01041380648612127 - }, - "boolq": { - "acc": 0.591131498470948, - "acc_stderr": 0.008598573693259113 - }, - "arc_easy": { - "acc": 0.6039562289562289, - "acc_stderr": 0.010035580962097944, - "acc_norm": 
0.5429292929292929, - "acc_norm_stderr": 0.01022189756425604 - }, - "arc_challenge": { - "acc": 0.28668941979522183, - "acc_stderr": 0.01321498632927478, - "acc_norm": 0.2986348122866894, - "acc_norm_stderr": 0.01337407861506875 - }, - "sciq": { - "acc": 0.857, - "acc_stderr": 0.01107581480856704, - "acc_norm": 0.764, - "acc_norm_stderr": 0.013434451402438676 - }, - "piqa": { - "acc": 0.749183895538629, - "acc_stderr": 0.010113869547069044, - "acc_norm": 0.7682263329706203, - "acc_norm_stderr": 0.009845143772794046 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_1.csv b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..5e5abcf1c51ec7f19fa51c7bd9801288009ec63f --- /dev/null +++ b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.328,0.014853842487270336,0 +anli_r2,acc,0.326,0.014830507204541028,0 +anli_r3,acc,0.3491666666666667,0.01376707539507725,0 +arc_challenge,acc,0.29436860068259385,0.013318528460539422,0 +arc_challenge,acc_norm,0.30631399317406144,0.013470584417276511,0 +arc_easy,acc,0.6178451178451179,0.009970747281292436,0 +arc_easy,acc_norm,0.5824915824915825,0.010119187377776038,0 +boolq,acc,0.5935779816513761,0.008590531708882184,1 +cb,acc,0.48214285714285715,0.06737697508644648,1 +cb,f1,0.3421052631578947,,1 +copa,acc,0.8,0.04020151261036845,0 +hellaswag,acc,0.47470623381796456,0.004983392650570958,0 +hellaswag,acc_norm,0.6215893248356901,0.00483999574560232,0 +piqa,acc,0.7513601741022851,0.010084511234296859,0 +piqa,acc_norm,0.7568008705114254,0.010009611953858914,0 +rte,acc,0.5631768953068592,0.02985524739031495,0 +sciq,acc,0.902,0.009406619184621243,0 +sciq,acc_norm,0.864,0.01084535023047299,0 +storycloze_2016,acc,0.7167290219134153,0.010419760409155363,0 +winogrande,acc,0.574585635359116,0.013895257666646382,0 diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json deleted file mode 100644 index ceecf8cb7d5ab2b2b57632a3be821caf84975453..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-05_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.328, - "acc_stderr": 0.014853842487270336 - }, - "anli_r2": { - "acc": 0.326, - "acc_stderr": 0.014830507204541028 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.01376707539507725 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.06737697508644648, - "f1": 0.3421052631578947 - }, - "copa": { - "acc": 0.8, - "acc_stderr": 0.04020151261036845 - }, - "hellaswag": { - "acc": 0.47470623381796456, - "acc_stderr": 0.004983392650570958, - "acc_norm": 0.6215893248356901, - "acc_norm_stderr": 0.00483999574560232 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.02985524739031495 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.013895257666646382 - }, - "storycloze_2016": { - "acc": 0.7167290219134153, - 
"acc_stderr": 0.010419760409155363 - }, - "boolq": { - "acc": 0.5935779816513761, - "acc_stderr": 0.008590531708882184 - }, - "arc_easy": { - "acc": 0.6178451178451179, - "acc_stderr": 0.009970747281292436, - "acc_norm": 0.5824915824915825, - "acc_norm_stderr": 0.010119187377776038 - }, - "arc_challenge": { - "acc": 0.29436860068259385, - "acc_stderr": 0.013318528460539422, - "acc_norm": 0.30631399317406144, - "acc_norm_stderr": 0.013470584417276511 - }, - "sciq": { - "acc": 0.902, - "acc_stderr": 0.009406619184621243, - "acc_norm": 0.864, - "acc_norm_stderr": 0.01084535023047299 - }, - "piqa": { - "acc": 0.7513601741022851, - "acc_stderr": 0.010084511234296859, - "acc_norm": 0.7568008705114254, - "acc_norm_stderr": 0.010009611953858914 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_2.csv b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..4c7f5aa930d0a7ada8b493546c97007f3f54af4b --- /dev/null +++ b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.322,0.014782913600996653,0 +anli_r2,acc,0.349,0.015080663991563098,0 +anli_r3,acc,0.3233333333333333,0.013508372867300219,0 +arc_challenge,acc,0.29266211604095566,0.01329591610361943,0 +arc_challenge,acc_norm,0.3302047781569966,0.013743085603760427,0 +arc_easy,acc,0.6245791245791246,0.0099362185271143,0 +arc_easy,acc_norm,0.6052188552188552,0.01003003893588359,0 +boolq,acc,0.6128440366972477,0.008519429207594416,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.2588235294117647,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.4731129257120096,0.004982561815214124,0 +hellaswag,acc_norm,0.6232822146982673,0.00483572890373141,0 +piqa,acc,0.7453754080522307,0.01016443223706049,0 +piqa,acc_norm,0.7611534276387377,0.009948120385337484,0 +rte,acc,0.48375451263537905,0.030080573208738064,0 +sciq,acc,0.914,0.008870325962594766,0 +sciq,acc_norm,0.889,0.009938701010583726,0 +storycloze_2016,acc,0.7172634954569749,0.01041380648612127,0 +winogrande,acc,0.5974743488555643,0.01378286683170305,0 diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-05_2shots_backup.json b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-05_2shots_backup.json deleted file mode 100644 index 7493e8b9a808beb1303470e4845a747a7185774c..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-05_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.322, - "acc_stderr": 0.014782913600996653 - }, - "anli_r2": { - "acc": 0.349, - "acc_stderr": 0.015080663991563098 - }, - "anli_r3": { - "acc": 0.3233333333333333, - "acc_stderr": 0.013508372867300219 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.2588235294117647 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.4731129257120096, - "acc_stderr": 0.004982561815214124, - "acc_norm": 0.6232822146982673, - "acc_norm_stderr": 0.00483572890373141 - }, - 
"rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.5974743488555643, - "acc_stderr": 0.01378286683170305 - }, - "storycloze_2016": { - "acc": 0.7172634954569749, - "acc_stderr": 0.01041380648612127 - }, - "boolq": { - "acc": 0.6128440366972477, - "acc_stderr": 0.008519429207594416 - }, - "arc_easy": { - "acc": 0.6245791245791246, - "acc_stderr": 0.0099362185271143, - "acc_norm": 0.6052188552188552, - "acc_norm_stderr": 0.01003003893588359 - }, - "arc_challenge": { - "acc": 0.29266211604095566, - "acc_stderr": 0.01329591610361943, - "acc_norm": 0.3302047781569966, - "acc_norm_stderr": 0.013743085603760427 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.889, - "acc_norm_stderr": 0.009938701010583726 - }, - "piqa": { - "acc": 0.7453754080522307, - "acc_stderr": 0.01016443223706049, - "acc_norm": 0.7611534276387377, - "acc_norm_stderr": 0.009948120385337484 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_3.csv b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..5b261258a78b45a9fa3209c537fbfb6c31ea838b --- /dev/null +++ b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.311,0.014645596385722692,0 +anli_r2,acc,0.356,0.015149042659306626,0 +anli_r3,acc,0.3425,0.013704669762934727,0 +arc_challenge,acc,0.2960750853242321,0.013340916085246258,0 +arc_challenge,acc_norm,0.3122866894197952,0.013542598541688065,0 +arc_easy,acc,0.6292087542087542,0.009911292822056921,0 +arc_easy,acc_norm,0.6153198653198653,0.009983171707008999,0 +boolq,acc,0.6103975535168196,0.008529228894936293,1 +cb,acc,0.5535714285714286,0.06703189227942397,1 +cb,f1,0.49967511371020135,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.4714200358494324,0.004981623292196191,0 +hellaswag,acc_norm,0.6261700856403107,0.004828305041904401,0 +piqa,acc,0.7578890097932536,0.009994371269104376,0 +piqa,acc_norm,0.764417845484222,0.009901067586473886,0 +rte,acc,0.5342960288808665,0.03002557981936643,0 +sciq,acc,0.911,0.009008893392651535,0 +sciq,acc_norm,0.892,0.009820001651345693,0 +storycloze_2016,acc,0.729021913415286,0.010278188399635043,0 +winogrande,acc,0.6006314127861089,0.013764933546717612,0 diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-05_3shots_backup.json b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-05_3shots_backup.json deleted file mode 100644 index 575fb97b6be3cb2cc1f232f7ec91435a68cdecb7..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-05_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.311, - "acc_stderr": 0.014645596385722692 - }, - "anli_r2": { - "acc": 0.356, - "acc_stderr": 0.015149042659306626 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934727 - }, - "cb": { - "acc": 0.5535714285714286, - "acc_stderr": 0.06703189227942397, - "f1": 0.49967511371020135 - }, - "copa": { - "acc": 0.79, - 
"acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.4714200358494324, - "acc_stderr": 0.004981623292196191, - "acc_norm": 0.6261700856403107, - "acc_norm_stderr": 0.004828305041904401 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.03002557981936643 - }, - "winogrande": { - "acc": 0.6006314127861089, - "acc_stderr": 0.013764933546717612 - }, - "storycloze_2016": { - "acc": 0.729021913415286, - "acc_stderr": 0.010278188399635043 - }, - "boolq": { - "acc": 0.6103975535168196, - "acc_stderr": 0.008529228894936293 - }, - "arc_easy": { - "acc": 0.6292087542087542, - "acc_stderr": 0.009911292822056921, - "acc_norm": 0.6153198653198653, - "acc_norm_stderr": 0.009983171707008999 - }, - "arc_challenge": { - "acc": 0.2960750853242321, - "acc_stderr": 0.013340916085246258, - "acc_norm": 0.3122866894197952, - "acc_norm_stderr": 0.013542598541688065 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651535, - "acc_norm": 0.892, - "acc_norm_stderr": 0.009820001651345693 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.009994371269104376, - "acc_norm": 0.764417845484222, - "acc_norm_stderr": 0.009901067586473886 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_4.csv b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..785095a5c36ec8ed49024bdad97194391c9002f3 --- /dev/null +++ b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.344,0.015029633724408943,0 +anli_r2,acc,0.374,0.015308767369006356,0 +anli_r3,acc,0.34833333333333333,0.01375943749887407,0 +arc_challenge,acc,0.302901023890785,0.013428241573185349,0 +arc_challenge,acc_norm,0.3242320819112628,0.01367881039951882,0 +arc_easy,acc,0.6262626262626263,0.009927267058259625,0 +arc_easy,acc_norm,0.61489898989899,0.009985214798737251,0 +boolq,acc,0.6168195718654435,0.008503021391450788,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.19999999999999998,,1 +copa,acc,0.77,0.04229525846816505,0 +hellaswag,acc,0.4715196176060546,0.004981680090303699,0 +hellaswag,acc_norm,0.6269667396932882,0.004826224784850446,0 +piqa,acc,0.7529923830250272,0.010062268140772624,0 +piqa,acc_norm,0.7616974972796517,0.009940334245876219,0 +rte,acc,0.48736462093862815,0.030086851767188564,0 +sciq,acc,0.912,0.008963053962592078,0 +sciq,acc_norm,0.896,0.009658016218524301,0 +storycloze_2016,acc,0.7327632282202031,0.010233145255103061,0 +winogrande,acc,0.5864246250986582,0.013840971763195304,0 diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-05_4shots_backup.json b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-05_4shots_backup.json deleted file mode 100644 index b56dfdf3f1edbfaef8cacb230d1f7fab389105e8..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-05_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.344, - "acc_stderr": 0.015029633724408943 - }, - "anli_r2": { - "acc": 0.374, - "acc_stderr": 0.015308767369006356 - }, - 
"anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.01375943749887407 - }, - "cb": { - "acc": 0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.19999999999999998 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816505 - }, - "hellaswag": { - "acc": 0.4715196176060546, - "acc_stderr": 0.004981680090303699, - "acc_norm": 0.6269667396932882, - "acc_norm_stderr": 0.004826224784850446 - }, - "rte": { - "acc": 0.48736462093862815, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.5864246250986582, - "acc_stderr": 0.013840971763195304 - }, - "storycloze_2016": { - "acc": 0.7327632282202031, - "acc_stderr": 0.010233145255103061 - }, - "boolq": { - "acc": 0.6168195718654435, - "acc_stderr": 0.008503021391450788 - }, - "arc_easy": { - "acc": 0.6262626262626263, - "acc_stderr": 0.009927267058259625, - "acc_norm": 0.61489898989899, - "acc_norm_stderr": 0.009985214798737251 - }, - "arc_challenge": { - "acc": 0.302901023890785, - "acc_stderr": 0.013428241573185349, - "acc_norm": 0.3242320819112628, - "acc_norm_stderr": 0.01367881039951882 - }, - "sciq": { - "acc": 0.912, - "acc_stderr": 0.008963053962592078, - "acc_norm": 0.896, - "acc_norm_stderr": 0.009658016218524301 - }, - "piqa": { - "acc": 0.7529923830250272, - "acc_stderr": 0.010062268140772624, - "acc_norm": 0.7616974972796517, - "acc_norm_stderr": 0.009940334245876219 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_5.csv b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..9eb49f155f9d07284c39f72c7c754ebb4df0a4af --- /dev/null +++ b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.36,0.015186527932040126,0 +anli_r2,acc,0.368,0.015258073561521802,0 +anli_r3,acc,0.335,0.013630871843821474,0 +arc_challenge,acc,0.302901023890785,0.013428241573185349,0 +arc_challenge,acc_norm,0.3199658703071672,0.013631345807016193,0 +arc_easy,acc,0.6233164983164983,0.009942848077476172,0 +arc_easy,acc_norm,0.6220538720538721,0.009949405744045481,0 +boolq,acc,0.6079510703363914,0.008538802914911995,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.2222222222222222,,1 +copa,acc,0.8,0.040201512610368445,0 +hellaswag,acc,0.4727145986855208,0.004982346155911132,0 +hellaswag,acc_norm,0.6279625572595101,0.004823604775015894,0 +piqa,acc,0.749727965179543,0.01010656188008979,0 +piqa,acc_norm,0.7546245919477693,0.010039831320422384,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.912,0.008963053962592085,0 +sciq,acc_norm,0.905,0.0092769101031033,0 +storycloze_2016,acc,0.7220737573490112,0.010359403651225865,0 +winogrande,acc,0.5895816890292028,0.01382510712003587,0 diff --git a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-05_5shots_backup.json b/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-05_5shots_backup.json deleted file mode 100644 index aadf4bb554b489342b6900d82cac3dd79e44390d..0000000000000000000000000000000000000000 --- a/4b284b28bc4seed4/evaluation/rankeval/4b284b28bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-05_5shots_backup.json +++ 
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.36,
-      "acc_stderr": 0.015186527932040126
-    },
-    "anli_r2": {
-      "acc": 0.368,
-      "acc_stderr": 0.015258073561521802
-    },
-    "anli_r3": {
-      "acc": 0.335,
-      "acc_stderr": 0.013630871843821474
-    },
-    "cb": {
-      "acc": 0.5,
-      "acc_stderr": 0.06741998624632421,
-      "f1": 0.2222222222222222
-    },
-    "copa": {
-      "acc": 0.8,
-      "acc_stderr": 0.040201512610368445
-    },
-    "hellaswag": {
-      "acc": 0.4727145986855208,
-      "acc_stderr": 0.004982346155911132,
-      "acc_norm": 0.6279625572595101,
-      "acc_norm_stderr": 0.004823604775015894
-    },
-    "rte": {
-      "acc": 0.5342960288808665,
-      "acc_stderr": 0.030025579819366426
-    },
-    "winogrande": {
-      "acc": 0.5895816890292028,
-      "acc_stderr": 0.01382510712003587
-    },
-    "storycloze_2016": {
-      "acc": 0.7220737573490112,
-      "acc_stderr": 0.010359403651225865
-    },
-    "boolq": {
-      "acc": 0.6079510703363914,
-      "acc_stderr": 0.008538802914911995
-    },
-    "arc_easy": {
-      "acc": 0.6233164983164983,
-      "acc_stderr": 0.009942848077476172,
-      "acc_norm": 0.6220538720538721,
-      "acc_norm_stderr": 0.009949405744045481
-    },
-    "arc_challenge": {
-      "acc": 0.302901023890785,
-      "acc_stderr": 0.013428241573185349,
-      "acc_norm": 0.3199658703071672,
-      "acc_norm_stderr": 0.013631345807016193
-    },
-    "sciq": {
-      "acc": 0.912,
-      "acc_stderr": 0.008963053962592085,
-      "acc_norm": 0.905,
-      "acc_norm_stderr": 0.0092769101031033
-    },
-    "piqa": {
-      "acc": 0.749727965179543,
-      "acc_stderr": 0.01010656188008979,
-      "acc_norm": 0.7546245919477693,
-      "acc_norm_stderr": 0.010039831320422384
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_0.csv b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_0.csv
new file mode 100644
index 0000000000000000000000000000000000000000..59f14cac508897d12cd38c3db2adf4d2238fb4be
--- /dev/null
+++ b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_0.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.334,0.014922019523732963,0
+anli_r2,acc,0.336,0.014944140233795027,0
+anli_r3,acc,0.33666666666666667,0.013647602942406396,0
+arc_challenge,acc,0.2815699658703072,0.013143376735009019,0
+arc_challenge,acc_norm,0.29948805460750855,0.013385021637313562,0
+arc_easy,acc,0.5989057239057239,0.010057051106534364,0
+arc_easy,acc_norm,0.5290404040404041,0.010242463826395614,0
+boolq,acc,0.608868501529052,0.008535239054221164,1
+cb,acc,0.39285714285714285,0.0658538889806635,1
+cb,f1,0.18803418803418803,,1
+copa,acc,0.76,0.04292346959909283,0
+hellaswag,acc,0.4736108344951205,0.004982826916687148,0
+hellaswag,acc_norm,0.6192989444333798,0.004845668799108534,0
+piqa,acc,0.7568008705114254,0.01000961195385892,0
+piqa,acc_norm,0.7546245919477693,0.010039831320422386,0
+rte,acc,0.5306859205776173,0.030039730592197812,0
+sciq,acc,0.829,0.011912216456264604,0
+sciq,acc_norm,0.746,0.01377220656516854,0
+storycloze_2016,acc,0.7177979690005345,0.010407834479647673,0
+winogrande,acc,0.5974743488555643,0.013782866831703044,0
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json
deleted file mode 100644
index 40aad1114e575a7bfa72099a90320d06c8fac4e5..0000000000000000000000000000000000000000
--- a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_0_lm-eval_global_step80108_2023-02-15-11-04-03_0shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.334,
-      "acc_stderr": 0.014922019523732963
-    },
-    "anli_r2": {
-      "acc": 0.336,
-      "acc_stderr": 0.014944140233795027
-    },
-    "anli_r3": {
-      "acc": 0.33666666666666667,
-      "acc_stderr": 0.013647602942406396
-    },
-    "cb": {
-      "acc": 0.39285714285714285,
-      "acc_stderr": 0.0658538889806635,
-      "f1": 0.18803418803418803
-    },
-    "copa": {
-      "acc": 0.76,
-      "acc_stderr": 0.04292346959909283
-    },
-    "hellaswag": {
-      "acc": 0.4736108344951205,
-      "acc_stderr": 0.004982826916687148,
-      "acc_norm": 0.6192989444333798,
-      "acc_norm_stderr": 0.004845668799108534
-    },
-    "rte": {
-      "acc": 0.5306859205776173,
-      "acc_stderr": 0.030039730592197812
-    },
-    "winogrande": {
-      "acc": 0.5974743488555643,
-      "acc_stderr": 0.013782866831703044
-    },
-    "storycloze_2016": {
-      "acc": 0.7177979690005345,
-      "acc_stderr": 0.010407834479647673
-    },
-    "boolq": {
-      "acc": 0.608868501529052,
-      "acc_stderr": 0.008535239054221164
-    },
-    "arc_easy": {
-      "acc": 0.5989057239057239,
-      "acc_stderr": 0.010057051106534364,
-      "acc_norm": 0.5290404040404041,
-      "acc_norm_stderr": 0.010242463826395614
-    },
-    "arc_challenge": {
-      "acc": 0.2815699658703072,
-      "acc_stderr": 0.013143376735009019,
-      "acc_norm": 0.29948805460750855,
-      "acc_norm_stderr": 0.013385021637313562
-    },
-    "sciq": {
-      "acc": 0.829,
-      "acc_stderr": 0.011912216456264604,
-      "acc_norm": 0.746,
-      "acc_norm_stderr": 0.01377220656516854
-    },
-    "piqa": {
-      "acc": 0.7568008705114254,
-      "acc_stderr": 0.01000961195385892,
-      "acc_norm": 0.7546245919477693,
-      "acc_norm_stderr": 0.010039831320422386
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_1.csv b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_1.csv
new file mode 100644
index 0000000000000000000000000000000000000000..609e33e2a338883e06e3eff3df10be3916bed783
--- /dev/null
+++ b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_1.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.335,0.014933117490932575,0
+anli_r2,acc,0.326,0.014830507204541033,0
+anli_r3,acc,0.3441666666666667,0.013720551062295756,0
+arc_challenge,acc,0.2713310580204778,0.012993807727545794,0
+arc_challenge,acc_norm,0.302901023890785,0.013428241573185349,0
+arc_easy,acc,0.6132154882154882,0.009993308355370968,0
+arc_easy,acc_norm,0.5774410774410774,0.010135978222981071,0
+boolq,acc,0.5685015290519878,0.008662594569027316,1
+cb,acc,0.4107142857142857,0.0663363415035954,1
+cb,f1,0.28917378917378916,,1
+copa,acc,0.78,0.04163331998932262,0
+hellaswag,acc,0.4714200358494324,0.00498162329219619,0
+hellaswag,acc_norm,0.6203943437562238,0.004842969887794082,0
+piqa,acc,0.7464635473340587,0.010150090834551794,0
+piqa,acc_norm,0.749183895538629,0.010113869547069046,0
+rte,acc,0.51985559566787,0.030072723167317177,0
+sciq,acc,0.868,0.010709373963528012,0
+sciq,acc_norm,0.841,0.0115694793682713,0
+storycloze_2016,acc,0.7049706039551042,0.010546232606962283,0
+winogrande,acc,0.5682715074980268,0.01392087211001071,0
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json
deleted file mode 100644
index be8a7a61b332c0c3d82ba0df2da0bd6498b4e4e2..0000000000000000000000000000000000000000
--- a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_1_lm-eval_global_step80108_2023-02-15-11-04-03_1shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.335,
-      "acc_stderr": 0.014933117490932575
-    },
-    "anli_r2": {
-      "acc": 0.326,
-      "acc_stderr": 0.014830507204541033
-    },
-    "anli_r3": {
-      "acc": 0.3441666666666667,
-      "acc_stderr": 0.013720551062295756
-    },
-    "cb": {
-      "acc": 0.4107142857142857,
-      "acc_stderr": 0.0663363415035954,
-      "f1": 0.28917378917378916
-    },
-    "copa": {
-      "acc": 0.78,
-      "acc_stderr": 0.04163331998932262
-    },
-    "hellaswag": {
-      "acc": 0.4714200358494324,
-      "acc_stderr": 0.00498162329219619,
-      "acc_norm": 0.6203943437562238,
-      "acc_norm_stderr": 0.004842969887794082
-    },
-    "rte": {
-      "acc": 0.51985559566787,
-      "acc_stderr": 0.030072723167317177
-    },
-    "winogrande": {
-      "acc": 0.5682715074980268,
-      "acc_stderr": 0.01392087211001071
-    },
-    "storycloze_2016": {
-      "acc": 0.7049706039551042,
-      "acc_stderr": 0.010546232606962283
-    },
-    "boolq": {
-      "acc": 0.5685015290519878,
-      "acc_stderr": 0.008662594569027316
-    },
-    "arc_easy": {
-      "acc": 0.6132154882154882,
-      "acc_stderr": 0.009993308355370968,
-      "acc_norm": 0.5774410774410774,
-      "acc_norm_stderr": 0.010135978222981071
-    },
-    "arc_challenge": {
-      "acc": 0.2713310580204778,
-      "acc_stderr": 0.012993807727545794,
-      "acc_norm": 0.302901023890785,
-      "acc_norm_stderr": 0.013428241573185349
-    },
-    "sciq": {
-      "acc": 0.868,
-      "acc_stderr": 0.010709373963528012,
-      "acc_norm": 0.841,
-      "acc_norm_stderr": 0.0115694793682713
-    },
-    "piqa": {
-      "acc": 0.7464635473340587,
-      "acc_stderr": 0.010150090834551794,
-      "acc_norm": 0.749183895538629,
-      "acc_norm_stderr": 0.010113869547069046
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_2.csv b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_2.csv
new file mode 100644
index 0000000000000000000000000000000000000000..5c5eccb26ff4ce8cd02872a36f08d2cd52eafdd8
--- /dev/null
+++ b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_2.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.331,0.014888272588203945,0
+anli_r2,acc,0.35,0.015090650341444231,0
+anli_r3,acc,0.33416666666666667,0.013622434813136769,0
+arc_challenge,acc,0.28924914675767915,0.013250012579393443,0
+arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0
+arc_easy,acc,0.6102693602693603,0.01000716939179705,0
+arc_easy,acc_norm,0.5993265993265994,0.010055304474255582,0
+boolq,acc,0.5519877675840978,0.008697655510897228,1
+cb,acc,0.375,0.06527912098338669,1
+cb,f1,0.26182156999767064,,1
+copa,acc,0.75,0.04351941398892446,0
+hellaswag,acc,0.468034256124278,0.0049795737655758555,0
+hellaswag,acc_norm,0.6188010356502689,0.00484688692976345,0
+piqa,acc,0.7529923830250272,0.010062268140772622,0
+piqa,acc_norm,0.7584330794341676,0.00998671800180446,0
+rte,acc,0.5342960288808665,0.03002557981936643,0
+sciq,acc,0.883,0.010169287802713329,0
+sciq,acc_norm,0.865,0.010811655372416053,0
+storycloze_2016,acc,0.7151256012827365,0.01043751398661172,0
+winogrande,acc,0.5769534333070244,0.013885055359056472,0
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_2_lm-eval_global_step80108_2023-02-15-11-04-03_2shots_backup.json b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_2_lm-eval_global_step80108_2023-02-15-11-04-03_2shots_backup.json
deleted file mode 100644
index 413c8811dd416db6680e567317344625688214aa..0000000000000000000000000000000000000000
--- a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_2_lm-eval_global_step80108_2023-02-15-11-04-03_2shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.331,
-      "acc_stderr": 0.014888272588203945
-    },
-    "anli_r2": {
-      "acc": 0.35,
-      "acc_stderr": 0.015090650341444231
-    },
-    "anli_r3": {
-      "acc": 0.33416666666666667,
-      "acc_stderr": 0.013622434813136769
-    },
-    "cb": {
-      "acc": 0.375,
-      "acc_stderr": 0.06527912098338669,
-      "f1": 0.26182156999767064
-    },
-    "copa": {
-      "acc": 0.75,
-      "acc_stderr": 0.04351941398892446
-    },
-    "hellaswag": {
-      "acc": 0.468034256124278,
-      "acc_stderr": 0.0049795737655758555,
-      "acc_norm": 0.6188010356502689,
-      "acc_norm_stderr": 0.00484688692976345
-    },
-    "rte": {
-      "acc": 0.5342960288808665,
-      "acc_stderr": 0.03002557981936643
-    },
-    "winogrande": {
-      "acc": 0.5769534333070244,
-      "acc_stderr": 0.013885055359056472
-    },
-    "storycloze_2016": {
-      "acc": 0.7151256012827365,
-      "acc_stderr": 0.01043751398661172
-    },
-    "boolq": {
-      "acc": 0.5519877675840978,
-      "acc_stderr": 0.008697655510897228
-    },
-    "arc_easy": {
-      "acc": 0.6102693602693603,
-      "acc_stderr": 0.01000716939179705,
-      "acc_norm": 0.5993265993265994,
-      "acc_norm_stderr": 0.010055304474255582
-    },
-    "arc_challenge": {
-      "acc": 0.28924914675767915,
-      "acc_stderr": 0.013250012579393443,
-      "acc_norm": 0.30887372013651876,
-      "acc_norm_stderr": 0.013501770929344003
-    },
-    "sciq": {
-      "acc": 0.883,
-      "acc_stderr": 0.010169287802713329,
-      "acc_norm": 0.865,
-      "acc_norm_stderr": 0.010811655372416053
-    },
-    "piqa": {
-      "acc": 0.7529923830250272,
-      "acc_stderr": 0.010062268140772622,
-      "acc_norm": 0.7584330794341676,
-      "acc_norm_stderr": 0.00998671800180446
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_3.csv b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_3.csv
new file mode 100644
index 0000000000000000000000000000000000000000..11a0fa81f9d9f675a9f61eedc58077d0c8b02bb4
--- /dev/null
+++ b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_3.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.321,0.014770821817934645,0
+anli_r2,acc,0.332,0.014899597242811482,0
+anli_r3,acc,0.3425,0.013704669762934728,0
+arc_challenge,acc,0.27474402730375425,0.013044617212771227,0
+arc_challenge,acc_norm,0.3037542662116041,0.013438909184778757,0
+arc_easy,acc,0.6178451178451179,0.009970747281292436,0
+arc_easy,acc_norm,0.601010101010101,0.010048240683798748,0
+boolq,acc,0.5532110091743119,0.008695392261996192,1
+cb,acc,0.39285714285714285,0.0658538889806635,1
+cb,f1,0.2736842105263158,,1
+copa,acc,0.76,0.04292346959909283,0
+hellaswag,acc,0.46873132842063336,0.004980014536539821,0
+hellaswag,acc_norm,0.6212905795658236,0.0048407422067181065,0
+piqa,acc,0.7557127312295974,0.010024765172284242,0
+piqa,acc_norm,0.7600652883569097,0.009963625892809545,0
+rte,acc,0.5234657039711191,0.03006330041190266,0
+sciq,acc,0.887,0.010016552866696846,0
+sciq,acc_norm,0.876,0.010427498872343973,0
+storycloze_2016,acc,0.7156600748262961,0.010431614128665253,0
+winogrande,acc,0.5895816890292028,0.013825107120035866,0
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json
deleted file mode 100644
index 79bb26b04cfc4ecb0313063cd1857f4701361198..0000000000000000000000000000000000000000
--- a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_3_lm-eval_global_step80108_2023-02-15-11-04-03_3shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.321,
-      "acc_stderr": 0.014770821817934645
-    },
-    "anli_r2": {
-      "acc": 0.332,
-      "acc_stderr": 0.014899597242811482
-    },
-    "anli_r3": {
-      "acc": 0.3425,
-      "acc_stderr": 0.013704669762934728
-    },
-    "cb": {
-      "acc": 0.39285714285714285,
-      "acc_stderr": 0.0658538889806635,
-      "f1": 0.2736842105263158
-    },
-    "copa": {
-      "acc": 0.76,
-      "acc_stderr": 0.04292346959909283
-    },
-    "hellaswag": {
-      "acc": 0.46873132842063336,
-      "acc_stderr": 0.004980014536539821,
-      "acc_norm": 0.6212905795658236,
-      "acc_norm_stderr": 0.0048407422067181065
-    },
-    "rte": {
-      "acc": 0.5234657039711191,
-      "acc_stderr": 0.03006330041190266
-    },
-    "winogrande": {
-      "acc": 0.5895816890292028,
-      "acc_stderr": 0.013825107120035866
-    },
-    "storycloze_2016": {
-      "acc": 0.7156600748262961,
-      "acc_stderr": 0.010431614128665253
-    },
-    "boolq": {
-      "acc": 0.5532110091743119,
-      "acc_stderr": 0.008695392261996192
-    },
-    "arc_easy": {
-      "acc": 0.6178451178451179,
-      "acc_stderr": 0.009970747281292436,
-      "acc_norm": 0.601010101010101,
-      "acc_norm_stderr": 0.010048240683798748
-    },
-    "arc_challenge": {
-      "acc": 0.27474402730375425,
-      "acc_stderr": 0.013044617212771227,
-      "acc_norm": 0.3037542662116041,
-      "acc_norm_stderr": 0.013438909184778757
-    },
-    "sciq": {
-      "acc": 0.887,
-      "acc_stderr": 0.010016552866696846,
-      "acc_norm": 0.876,
-      "acc_norm_stderr": 0.010427498872343973
-    },
-    "piqa": {
-      "acc": 0.7557127312295974,
-      "acc_stderr": 0.010024765172284242,
-      "acc_norm": 0.7600652883569097,
-      "acc_norm_stderr": 0.009963625892809545
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_4.csv b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_4.csv
new file mode 100644
index 0000000000000000000000000000000000000000..10a3a235463264406f8364011855d205fda68365
--- /dev/null
+++ b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_4.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.324,0.014806864733738856,0
+anli_r2,acc,0.359,0.015177264224798596,0
+anli_r3,acc,0.3466666666666667,0.013744022550571952,0
+arc_challenge,acc,0.27474402730375425,0.013044617212771227,0
+arc_challenge,acc_norm,0.30204778156996587,0.01341751914471642,0
+arc_easy,acc,0.6195286195286195,0.009962305992058577,0
+arc_easy,acc_norm,0.6052188552188552,0.010030038935883598,0
+boolq,acc,0.5278287461773701,0.008731499445069549,1
+cb,acc,0.3392857142857143,0.06384226561930827,1
+cb,f1,0.2379084967320261,,1
+copa,acc,0.77,0.04229525846816506,0
+hellaswag,acc,0.46793467436765585,0.004979510001776621,0
+hellaswag,acc_norm,0.6213901613224457,0.004840493603166203,0
+piqa,acc,0.7568008705114254,0.010009611953858922,0
+piqa,acc_norm,0.7671381936887922,0.009861236071080757,0
+rte,acc,0.49458483754512633,0.030094698123239966,0
+sciq,acc,0.892,0.0098200016513457,0
+sciq,acc_norm,0.889,0.009938701010583726,0
+storycloze_2016,acc,0.7226082308925709,0.010353267472010767,0
+winogrande,acc,0.5816890292028414,0.013863669961195911,0
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json
deleted file mode 100644
index 73828634b8afe3e81f5246ed2d6ac4377a060881..0000000000000000000000000000000000000000
--- a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_4_lm-eval_global_step80108_2023-02-15-11-04-03_4shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.324,
-      "acc_stderr": 0.014806864733738856
-    },
-    "anli_r2": {
-      "acc": 0.359,
-      "acc_stderr": 0.015177264224798596
-    },
-    "anli_r3": {
-      "acc": 0.3466666666666667,
-      "acc_stderr": 0.013744022550571952
-    },
-    "cb": {
-      "acc": 0.3392857142857143,
-      "acc_stderr": 0.06384226561930827,
-      "f1": 0.2379084967320261
-    },
-    "copa": {
-      "acc": 0.77,
-      "acc_stderr": 0.04229525846816506
-    },
-    "hellaswag": {
-      "acc": 0.46793467436765585,
-      "acc_stderr": 0.004979510001776621,
-      "acc_norm": 0.6213901613224457,
-      "acc_norm_stderr": 0.004840493603166203
-    },
-    "rte": {
-      "acc": 0.49458483754512633,
-      "acc_stderr": 0.030094698123239966
-    },
-    "winogrande": {
-      "acc": 0.5816890292028414,
-      "acc_stderr": 0.013863669961195911
-    },
-    "storycloze_2016": {
-      "acc": 0.7226082308925709,
-      "acc_stderr": 0.010353267472010767
-    },
-    "boolq": {
-      "acc": 0.5278287461773701,
-      "acc_stderr": 0.008731499445069549
-    },
-    "arc_easy": {
-      "acc": 0.6195286195286195,
-      "acc_stderr": 0.009962305992058577,
-      "acc_norm": 0.6052188552188552,
-      "acc_norm_stderr": 0.010030038935883598
-    },
-    "arc_challenge": {
-      "acc": 0.27474402730375425,
-      "acc_stderr": 0.013044617212771227,
-      "acc_norm": 0.30204778156996587,
-      "acc_norm_stderr": 0.01341751914471642
-    },
-    "sciq": {
-      "acc": 0.892,
-      "acc_stderr": 0.0098200016513457,
-      "acc_norm": 0.889,
-      "acc_norm_stderr": 0.009938701010583726
-    },
-    "piqa": {
-      "acc": 0.7568008705114254,
-      "acc_stderr": 0.010009611953858922,
-      "acc_norm": 0.7671381936887922,
-      "acc_norm_stderr": 0.009861236071080757
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_5.csv b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_5.csv
new file mode 100644
index 0000000000000000000000000000000000000000..84c3107a5fe8514d2c218098bb159310cd9ccb92
--- /dev/null
+++ b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_5.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.339,0.014976758771620347,0
+anli_r2,acc,0.322,0.014782913600996664,0
+anli_r3,acc,0.35333333333333333,0.013804572162314925,0
+arc_challenge,acc,0.28498293515358364,0.013191348179838793,0
+arc_challenge,acc_norm,0.310580204778157,0.01352229209805305,0
+arc_easy,acc,0.6195286195286195,0.00996230599205857,0
+arc_easy,acc_norm,0.6136363636363636,0.009991296778159615,0
+boolq,acc,0.5290519877675841,0.00873028052845153,1
+cb,acc,0.375,0.06527912098338669,1
+cb,f1,0.25089094796863864,,1
+copa,acc,0.76,0.04292346959909283,0
+hellaswag,acc,0.4671380203146783,0.004978992721242829,0
+hellaswag,acc_norm,0.6250746863174667,0.004831142570475509,0
+piqa,acc,0.7453754080522307,0.01016443223706049,0
+piqa,acc_norm,0.7595212187159956,0.009971345364651066,0
+rte,acc,0.5018050541516246,0.030096267148976626,0
+sciq,acc,0.906,0.009233052000787736,0
+sciq,acc_norm,0.894,0.009739551265785133,0
+storycloze_2016,acc,0.7252805986103688,0.010322309878339502,0
+winogrande,acc,0.5832675611681136,0.01385625007279632,0
diff --git a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json b/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json
deleted file mode 100644
index fed816de2ab3e53f27886d381f52f6847eb328eb..0000000000000000000000000000000000000000
--- a/4b284b42bc4seed2/evaluation/rankeval/4b284b42bc4seed2_5_lm-eval_global_step80108_2023-02-15-11-04-03_5shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
-  "results": {
-    "anli_r1": {
-      "acc": 0.339,
-      "acc_stderr": 0.014976758771620347
-    },
-    "anli_r2": {
-      "acc": 0.322,
-      "acc_stderr": 0.014782913600996664
-    },
-    "anli_r3": {
-      "acc": 0.35333333333333333,
-      "acc_stderr": 0.013804572162314925
-    },
-    "cb": {
-      "acc": 0.375,
-      "acc_stderr": 0.06527912098338669,
-      "f1": 0.25089094796863864
-    },
-    "copa": {
-      "acc": 0.76,
-      "acc_stderr": 0.04292346959909283
-    },
-    "hellaswag": {
-      "acc": 0.4671380203146783,
-      "acc_stderr": 0.004978992721242829,
-      "acc_norm": 0.6250746863174667,
-      "acc_norm_stderr": 0.004831142570475509
-    },
-    "rte": {
-      "acc": 0.5018050541516246,
-      "acc_stderr": 0.030096267148976626
-    },
-    "winogrande": {
-      "acc": 0.5832675611681136,
-      "acc_stderr": 0.01385625007279632
-    },
-    "storycloze_2016": {
-      "acc": 0.7252805986103688,
-      "acc_stderr": 0.010322309878339502
-    },
-    "boolq": {
-      "acc": 0.5290519877675841,
-      "acc_stderr": 0.00873028052845153
-    },
-    "arc_easy": {
-      "acc": 0.6195286195286195,
-      "acc_stderr": 0.00996230599205857,
-      "acc_norm": 0.6136363636363636,
-      "acc_norm_stderr": 0.009991296778159615
-    },
-    "arc_challenge": {
-      "acc": 0.28498293515358364,
-      "acc_stderr": 0.013191348179838793,
-      "acc_norm": 0.310580204778157,
-      "acc_norm_stderr": 0.01352229209805305
-    },
-    "sciq": {
-      "acc": 0.906,
-      "acc_stderr": 0.009233052000787736,
-      "acc_norm": 0.894,
-      "acc_norm_stderr": 0.009739551265785133
-    },
-    "piqa": {
-      "acc": 0.7453754080522307,
-      "acc_stderr": 0.01016443223706049,
-      "acc_norm": 0.7595212187159956,
-      "acc_norm_stderr": 0.009971345364651066
-    }
-  },
-  "versions": {
-    "anli_r1": 0,
-    "anli_r2": 0,
-    "anli_r3": 0,
-    "cb": 1,
-    "copa": 0,
-    "hellaswag": 0,
-    "rte": 0,
-    "winogrande": 0,
-    "storycloze_2016": 0,
-    "boolq": 1,
-    "arc_easy": 0,
-    "arc_challenge": 0,
-    "sciq": 0,
-    "piqa": 0
-  }
-}
\ No newline at end of file
diff --git a/4b284b42bc4seed3/evaluation/generation/merged.csv b/4b284b42bc4seed3/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..f4c12b261333636861454c7178ee75e19dbe2a3b
--- /dev/null
+++ b/4b284b42bc4seed3/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.041429203797191096
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.041429203797191096
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1928054727426135
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1928054727426135
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2229687549240271
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2229687549240271
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23320849920258752
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23320849920258752
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23586940604720674
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23586940604720674
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23981788779086907
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23981788779086907
+e2e_nlg_cleaned,5,average,multiple,0.19434987075074917
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.046916069305408335
+gem_xsum,0,median,rouge2_fmeasure,0.046916069305408335
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03736288266942292
+gem_xsum,1,median,rouge2_fmeasure,0.03736288266942292
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.039942143589958415
+gem_xsum,2,median,rouge2_fmeasure,0.039942143589958415
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03904911931370361
+gem_xsum,3,median,rouge2_fmeasure,0.03904911931370361
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010722887449732417
+gem_xsum,4,median,rouge2_fmeasure,0.010722887449732417
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005993250623691804
+gem_xsum,5,median,rouge2_fmeasure,0.0005993250623691804
+gem_xsum,5,average,multiple,0.02909873789843248
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05033305845129639
+web_nlg_en,0,median,rouge2_fmeasure,0.05033305845129639
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0530528813586448
+web_nlg_en,1,median,rouge2_fmeasure,0.0530528813586448
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.054647342094887884
+web_nlg_en,2,median,rouge2_fmeasure,0.054647342094887884
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.054975410913038446
+web_nlg_en,3,median,rouge2_fmeasure,0.054975410913038446
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05676722017594987
+web_nlg_en,4,median,rouge2_fmeasure,0.05676722017594987
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05651438979578568
+web_nlg_en,5,median,rouge2_fmeasure,0.05651438979578568
+web_nlg_en,5,average,multiple,0.05438171713160051
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.04075668205435305
+wiki_lingua_en,0,median,rouge2_fmeasure,0.04075668205435305
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05309884853711394
+wiki_lingua_en,1,median,rouge2_fmeasure,0.05309884853711394
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05930210668592541
+wiki_lingua_en,2,median,rouge2_fmeasure,0.05930210668592541
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.0517180012203842
+wiki_lingua_en,3,median,rouge2_fmeasure,0.0517180012203842
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01647080810209277
+wiki_lingua_en,4,median,rouge2_fmeasure,0.01647080810209277
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.00278038991283995
+wiki_lingua_en,5,median,rouge2_fmeasure,0.00278038991283995
+wiki_lingua_en,5,average,multiple,0.03735447275211822
diff --git a/4b284b42bc4seed3/evaluation/generation/merged.json b/4b284b42bc4seed3/evaluation/generation/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..10b48a4b0ac902ac84a6dacfbb27fce43273d585
--- /dev/null
+++ 
b/4b284b42bc4seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.40286648385376755, "bleu_stderr": 0.05212845034450995, "rouge1_fmeasure": 0.10639343053245706, "rouge1_fmeasure_stderr": 0.0021073511307359532, "rouge1_precision": 0.06996098838307302, "rouge1_precision_stderr": 0.0015898449832407893, "rouge1_recall": 0.29442884741673947, "rouge1_recall_stderr": 0.004657771756760718, "rouge2_fmeasure": 0.05033305845129639, "rouge2_fmeasure_stderr": 0.0013249908270922567, "rouge2_precision": 0.03294981919147428, "rouge2_precision_stderr": 0.0009663398363788032, "rouge2_recall": 0.14246175839945485, "rouge2_recall_stderr": 0.0031909835354524702, "rougeL_fmeasure": 0.10235134870489063, "rougeL_fmeasure_stderr": 0.0019273341529169933, "rougeL_precision": 0.06695896997261808, "rougeL_precision_stderr": 0.0014202002477843082, "rougeL_recall": 0.28646151455290875, "rougeL_recall_stderr": 0.004529876020974902, "rougeLsum_fmeasure": 0.10161495575656154, "rougeLsum_fmeasure_stderr": 0.0019693633168815267, "rougeLsum_precision": 0.0667289062013898, "rougeLsum_precision_stderr": 0.001476878097104591, "rougeLsum_recall": 0.2816381384137136, "rougeLsum_recall_stderr": 0.00440124224176318}}, "1": {"PALM_prompt": {"bleu": 0.5379028926508614, "bleu_stderr": 0.03687531363084996, "rouge1_fmeasure": 0.11471205984296223, "rouge1_fmeasure_stderr": 0.0018805014434852309, "rouge1_precision": 0.07309631370592509, "rouge1_precision_stderr": 0.0013732592923049792, "rouge1_recall": 0.37028400918910054, "rouge1_recall_stderr": 0.005457409473033004, "rouge2_fmeasure": 0.0530528813586448, "rouge2_fmeasure_stderr": 0.001199555498269871, "rouge2_precision": 0.03377367444978376, "rouge2_precision_stderr": 0.0008548764812808871, "rouge2_recall": 0.17908798528025902, "rouge2_recall_stderr": 0.0037028704616034083, "rougeL_fmeasure": 0.10809733095164265, "rougeL_fmeasure_stderr": 0.0016925101881231047, "rougeL_precision": 0.0688117484703189, "rougeL_precision_stderr": 0.001222213532774839, "rougeL_recall": 0.3480091072970109, "rougeL_recall_stderr": 0.004950245404519997, "rougeLsum_fmeasure": 0.10846385734949525, "rougeLsum_fmeasure_stderr": 0.0017609033136198704, "rougeLsum_precision": 0.0692047577374815, "rougeLsum_precision_stderr": 0.0012923071491164684, "rougeLsum_recall": 0.34853415419811595, "rougeLsum_recall_stderr": 0.004966041933763933}}, "2": {"PALM_prompt": {"bleu": 0.5942366626957291, "bleu_stderr": 0.025627212989091184, "rouge1_fmeasure": 0.11889915113239362, "rouge1_fmeasure_stderr": 0.0017198201554726746, "rouge1_precision": 0.07493247583805844, "rouge1_precision_stderr": 0.0012395113463601354, "rouge1_recall": 0.4039521622707045, "rouge1_recall_stderr": 0.005434274713783703, "rouge2_fmeasure": 0.054647342094887884, "rouge2_fmeasure_stderr": 0.0010921234226901342, "rouge2_precision": 0.034282050377834015, "rouge2_precision_stderr": 0.0007600503732258525, "rouge2_recall": 0.1991669602854511, "rouge2_recall_stderr": 0.003931080841294317, "rougeL_fmeasure": 0.10982111537163444, "rougeL_fmeasure_stderr": 0.0015404704299286365, "rougeL_precision": 0.06925387716093144, "rougeL_precision_stderr": 0.001112414391402196, "rougeL_recall": 0.3720868248461876, "rougeL_recall_stderr": 0.004894890409864309, "rougeLsum_fmeasure": 0.11238895371903196, "rougeLsum_fmeasure_stderr": 0.0016119226692343728, "rougeLsum_precision": 0.07089026453871902, "rougeLsum_precision_stderr": 0.0011645544140396038, "rougeLsum_recall": 0.38071119224279054, "rougeLsum_recall_stderr": 
0.00501969435495313}}, "3": {"PALM_prompt": {"bleu": 0.6679262074421776, "bleu_stderr": 0.03333067116771941, "rouge1_fmeasure": 0.12006524308183385, "rouge1_fmeasure_stderr": 0.0016814614065816674, "rouge1_precision": 0.07536480662831223, "rouge1_precision_stderr": 0.0011983446313843626, "rouge1_recall": 0.4126495888007212, "rouge1_recall_stderr": 0.005516411154661876, "rouge2_fmeasure": 0.054975410913038446, "rouge2_fmeasure_stderr": 0.001083918785034363, "rouge2_precision": 0.03435429282776522, "rouge2_precision_stderr": 0.0007469545646988583, "rouge2_recall": 0.2021206807513725, "rouge2_recall_stderr": 0.003937737161421917, "rougeL_fmeasure": 0.11006804581376863, "rougeL_fmeasure_stderr": 0.001493830571926201, "rougeL_precision": 0.06923780115211091, "rougeL_precision_stderr": 0.001077323853164496, "rougeL_recall": 0.37665280691221614, "rougeL_recall_stderr": 0.004882350067942863, "rougeLsum_fmeasure": 0.1129753647300024, "rougeLsum_fmeasure_stderr": 0.001577915948877365, "rougeLsum_precision": 0.07103697236204776, "rougeLsum_precision_stderr": 0.0011338290169387135, "rougeLsum_recall": 0.38640310742292994, "rougeLsum_recall_stderr": 0.005040562618951275}}, "4": {"PALM_prompt": {"bleu": 0.6769067627224955, "bleu_stderr": 0.0461104671558504, "rouge1_fmeasure": 0.12273630978790588, "rouge1_fmeasure_stderr": 0.001663341770893685, "rouge1_precision": 0.07698161606160427, "rouge1_precision_stderr": 0.0011812041716973312, "rouge1_recall": 0.4169767197057726, "rouge1_recall_stderr": 0.005368256764036794, "rouge2_fmeasure": 0.05676722017594987, "rouge2_fmeasure_stderr": 0.0010551520395745532, "rouge2_precision": 0.03535772650403365, "rouge2_precision_stderr": 0.0007212057820482756, "rouge2_recall": 0.20970750904696342, "rouge2_recall_stderr": 0.0039457537348272925, "rougeL_fmeasure": 0.1122867012912741, "rougeL_fmeasure_stderr": 0.0014682525659690917, "rougeL_precision": 0.07053008542469329, "rougeL_precision_stderr": 0.001056451040385872, "rougeL_recall": 0.38052456768133897, "rougeL_recall_stderr": 0.004711823160775788, "rougeLsum_fmeasure": 0.11624170216768001, "rougeLsum_fmeasure_stderr": 0.0015625017951151554, "rougeLsum_precision": 0.07296806904648159, "rougeLsum_precision_stderr": 0.0011161127148090057, "rougeLsum_recall": 0.39425404712648704, "rougeLsum_recall_stderr": 0.004952931195655136}}, "5": {"PALM_prompt": {"bleu": 0.7298388215986955, "bleu_stderr": 0.03410846938230183, "rouge1_fmeasure": 0.12306547329071646, "rouge1_fmeasure_stderr": 0.0016005194094492465, "rouge1_precision": 0.07695562502104737, "rouge1_precision_stderr": 0.001146538309303851, "rouge1_recall": 0.42572244510998153, "rouge1_recall_stderr": 0.005292556910488358, "rouge2_fmeasure": 0.05651438979578568, "rouge2_fmeasure_stderr": 0.001041479745584778, "rouge2_precision": 0.035072722310952494, "rouge2_precision_stderr": 0.0007144692727423962, "rouge2_recall": 0.21311707552350648, "rouge2_recall_stderr": 0.003973886927101672, "rougeL_fmeasure": 0.11208456880715004, "rougeL_fmeasure_stderr": 0.0014454570151403556, "rougeL_precision": 0.07024401142170024, "rougeL_precision_stderr": 0.001051262962385294, "rougeL_recall": 0.3855636673236153, "rougeL_recall_stderr": 0.004614982713566353, "rougeLsum_fmeasure": 0.11597908604907735, "rougeLsum_fmeasure_stderr": 0.001497182042775692, "rougeLsum_precision": 0.07259500178551292, "rougeLsum_precision_stderr": 0.0010804144644452327, "rougeLsum_recall": 0.39997726550161855, "rougeLsum_recall_stderr": 0.0048313269402151205}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 
1.7892535258267361, "bleu_stderr": 0.07376152555399827, "rouge1_fmeasure": 0.18718732854154974, "rouge1_fmeasure_stderr": 0.0018924487998349612, "rouge1_precision": 0.15969051276064813, "rouge1_precision_stderr": 0.001942883143586728, "rouge1_recall": 0.27343051217750347, "rouge1_recall_stderr": 0.0027587854784945757, "rouge2_fmeasure": 0.04075668205435305, "rouge2_fmeasure_stderr": 0.0009022201823907085, "rouge2_precision": 0.03457636027289612, "rouge2_precision_stderr": 0.0008049732049233189, "rouge2_recall": 0.061281361460635966, "rouge2_recall_stderr": 0.0014971431420006567, "rougeL_fmeasure": 0.1452400681084372, "rougeL_fmeasure_stderr": 0.0013211415113596292, "rougeL_precision": 0.12243858898167204, "rougeL_precision_stderr": 0.0013264585867582177, "rougeL_recall": 0.21719592340715815, "rougeL_recall_stderr": 0.002198877203255072, "rougeLsum_fmeasure": 0.1717276627498721, "rougeLsum_fmeasure_stderr": 0.0017249210773351896, "rougeLsum_precision": 0.14631477056240805, "rougeLsum_precision_stderr": 0.0017693856980057917, "rougeLsum_recall": 0.25167431554385267, "rougeLsum_recall_stderr": 0.0025519675545699265}}, "1": {"tldr_en": {"bleu": 2.7313763312803463, "bleu_stderr": 0.06431841398930878, "rouge1_fmeasure": 0.2179019106488729, "rouge1_fmeasure_stderr": 0.001914589747533169, "rouge1_precision": 0.19337653863894458, "rouge1_precision_stderr": 0.0022789610297058423, "rouge1_recall": 0.31342249510279335, "rouge1_recall_stderr": 0.0027451555547873163, "rouge2_fmeasure": 0.05309884853711394, "rouge2_fmeasure_stderr": 0.0010362274557463985, "rouge2_precision": 0.04767366659499796, "rouge2_precision_stderr": 0.0010966119115502326, "rouge2_recall": 0.07815815946901311, "rouge2_recall_stderr": 0.0016866799115371686, "rougeL_fmeasure": 0.1573437634028076, "rougeL_fmeasure_stderr": 0.0013081487267953859, "rougeL_precision": 0.13876745029470247, "rougeL_precision_stderr": 0.0016193053665115018, "rougeL_recall": 0.23180330032633759, "rougeL_recall_stderr": 0.0021871697423696415, "rougeLsum_fmeasure": 0.20404526770912643, "rougeLsum_fmeasure_stderr": 0.0017913673119406505, "rougeLsum_precision": 0.18110312506761292, "rougeLsum_precision_stderr": 0.002148924896412393, "rougeLsum_recall": 0.2939865838321714, "rougeLsum_recall_stderr": 0.0026003070024099395}}, "2": {"tldr_en": {"bleu": 3.2706626686468483, "bleu_stderr": 0.05895243674760552, "rouge1_fmeasure": 0.22655355604347935, "rouge1_fmeasure_stderr": 0.0019575369428529144, "rouge1_precision": 0.2378942905907137, "rouge1_precision_stderr": 0.0031497210011433496, "rouge1_recall": 0.30323691861478563, "rouge1_recall_stderr": 0.002907309508940713, "rouge2_fmeasure": 0.05930210668592541, "rouge2_fmeasure_stderr": 0.0011620048939953744, "rouge2_precision": 0.06661871177427002, "rouge2_precision_stderr": 0.0018856657162201126, "rouge2_recall": 0.0803263511266704, "rouge2_recall_stderr": 0.0017407744409257712, "rougeL_fmeasure": 0.16378420244346403, "rougeL_fmeasure_stderr": 0.0014079706838807605, "rougeL_precision": 0.17510717603427448, "rougeL_precision_stderr": 0.002585888506583138, "rougeL_recall": 0.22190182805565306, "rougeL_recall_stderr": 0.0022886872280766754, "rougeLsum_fmeasure": 0.21397004830277258, "rougeLsum_fmeasure_stderr": 0.001847756158136501, "rougeLsum_precision": 0.22473899823994106, "rougeLsum_precision_stderr": 0.0030077620086430773, "rougeLsum_recall": 0.28698298853858273, "rougeLsum_recall_stderr": 0.0027715872137367885}}, "3": {"tldr_en": {"bleu": 3.625995619755928, "bleu_stderr": 0.08151204101758054, "rouge1_fmeasure": 
0.18879655923916508, "rouge1_fmeasure_stderr": 0.002417829019868095, "rouge1_precision": 0.22797269883915738, "rouge1_precision_stderr": 0.0038340085779668192, "rouge1_recall": 0.23645930801545356, "rouge1_recall_stderr": 0.003376410271713712, "rouge2_fmeasure": 0.0517180012203842, "rouge2_fmeasure_stderr": 0.0012108106278924731, "rouge2_precision": 0.06756596682222428, "rouge2_precision_stderr": 0.0021738332098718894, "rouge2_recall": 0.06552475514042137, "rouge2_recall_stderr": 0.0016796922078675188, "rougeL_fmeasure": 0.13925720702220254, "rougeL_fmeasure_stderr": 0.0018062024998216275, "rougeL_precision": 0.17257640354449272, "rougeL_precision_stderr": 0.0031587073849980592, "rougeL_recall": 0.1755452238104328, "rougeL_recall_stderr": 0.0026310211489849133, "rougeLsum_fmeasure": 0.17894024147930437, "rougeLsum_fmeasure_stderr": 0.002304299372894341, "rougeLsum_precision": 0.21606433484698467, "rougeLsum_precision_stderr": 0.00365906632460656, "rougeLsum_recall": 0.22396544472382549, "rougeLsum_recall_stderr": 0.003215244567646662}}, "4": {"tldr_en": {"bleu": 0.32276000238250857, "bleu_stderr": 0.04122539509538276, "rouge1_fmeasure": 0.05877235025374831, "rouge1_fmeasure_stderr": 0.0020757714328794993, "rouge1_precision": 0.07359995764515682, "rouge1_precision_stderr": 0.002967980885850916, "rouge1_recall": 0.07437337044894303, "rouge1_recall_stderr": 0.002771566352869776, "rouge2_fmeasure": 0.01647080810209277, "rouge2_fmeasure_stderr": 0.0008494024468294314, "rouge2_precision": 0.022811544918519226, "rouge2_precision_stderr": 0.0014875286720145739, "rouge2_recall": 0.021443597362593173, "rouge2_recall_stderr": 0.0012035793544498154, "rougeL_fmeasure": 0.044744539055611814, "rougeL_fmeasure_stderr": 0.0015882377996984806, "rougeL_precision": 0.05769488309593311, "rougeL_precision_stderr": 0.00243974941628649, "rougeL_recall": 0.05662359971916526, "rougeL_recall_stderr": 0.0021434534763578057, "rougeLsum_fmeasure": 0.055593054042710664, "rougeLsum_fmeasure_stderr": 0.0019668083119517665, "rougeLsum_precision": 0.06995076302877311, "rougeLsum_precision_stderr": 0.0028399207563645236, "rougeLsum_recall": 0.07020001543089976, "rougeLsum_recall_stderr": 0.002619552306226126}}, "5": {"tldr_en": {"bleu": 1.3484611407865652e-09, "bleu_stderr": 8.443352035948435e-09, "rouge1_fmeasure": 0.009114612973823273, "rouge1_fmeasure_stderr": 0.0009223542254736345, "rouge1_precision": 0.01239315444106573, "rouge1_precision_stderr": 0.0014363410021848279, "rouge1_recall": 0.011383300045857872, "rouge1_recall_stderr": 0.0012173762561540143, "rouge2_fmeasure": 0.00278038991283995, "rouge2_fmeasure_stderr": 0.00038473075637041824, "rouge2_precision": 0.0045302531996229065, "rouge2_precision_stderr": 0.0008330565796211205, "rouge2_recall": 0.003576470172001708, "rouge2_recall_stderr": 0.0005588651622667037, "rougeL_fmeasure": 0.007155949619192927, "rougeL_fmeasure_stderr": 0.0007211501481932415, "rougeL_precision": 0.01001834697102603, "rougeL_precision_stderr": 0.0012120937969546158, "rougeL_recall": 0.008985972581881612, "rougeL_recall_stderr": 0.0009837080918219663, "rougeLsum_fmeasure": 0.008540877676789025, "rougeLsum_fmeasure_stderr": 0.0008635134486980147, "rougeLsum_precision": 0.011636645385352324, "rougeLsum_precision_stderr": 0.001366681246596602, "rougeLsum_recall": 0.010738557235201493, "rougeLsum_recall_stderr": 0.001155987282700236}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.693401837200285, "bleu_stderr": 0.08763868635746681, "rouge1_fmeasure": 0.16611297494282404, 
"rouge1_fmeasure_stderr": 0.00159680296784645, "rouge1_precision": 0.16579904997386866, "rouge1_precision_stderr": 0.002186020056914844, "rouge1_recall": 0.22642045526757837, "rouge1_recall_stderr": 0.0023687905832833014, "rouge2_fmeasure": 0.041429203797191096, "rouge2_fmeasure_stderr": 0.0010121278902232196, "rouge2_precision": 0.03388945696322452, "rouge2_precision_stderr": 0.0008651029489382238, "rouge2_recall": 0.060593946509437495, "rouge2_recall_stderr": 0.0014889438447109118, "rougeL_fmeasure": 0.15663336526066807, "rougeL_fmeasure_stderr": 0.0013865257867417336, "rougeL_precision": 0.15530426027054353, "rougeL_precision_stderr": 0.0019863912296353655, "rougeL_recall": 0.21503755577893796, "rougeL_recall_stderr": 0.00211789398504925, "rougeLsum_fmeasure": 0.1428371423325733, "rougeLsum_fmeasure_stderr": 0.0014402421725265263, "rougeLsum_precision": 0.1450920759219515, "rougeLsum_precision_stderr": 0.002087614148443835, "rougeLsum_recall": 0.1931878795136461, "rougeLsum_recall_stderr": 0.0020747007413996115}}, "1": {"generate_text_restaurant": {"bleu": 10.811542079735363, "bleu_stderr": 0.15162454275554624, "rouge1_fmeasure": 0.42854660449035686, "rouge1_fmeasure_stderr": 0.0022746952803792044, "rouge1_precision": 0.5137013573524062, "rouge1_precision_stderr": 0.00319258683563529, "rouge1_recall": 0.40636878171094676, "rouge1_recall_stderr": 0.002882672968645596, "rouge2_fmeasure": 0.1928054727426135, "rouge2_fmeasure_stderr": 0.0019038284121157692, "rouge2_precision": 0.23534073778557085, "rouge2_precision_stderr": 0.0025715857623083557, "rouge2_recall": 0.18258609223833502, "rouge2_recall_stderr": 0.002046539791218941, "rougeL_fmeasure": 0.3113788779508062, "rougeL_fmeasure_stderr": 0.0019774235562405734, "rougeL_precision": 0.3762979694307179, "rougeL_precision_stderr": 0.0028693975057274054, "rougeL_recall": 0.2944414188627779, "rougeL_recall_stderr": 0.002337120458337281, "rougeLsum_fmeasure": 0.3498432663788988, "rougeLsum_fmeasure_stderr": 0.0022315022947435688, "rougeLsum_precision": 0.4205624013999805, "rougeLsum_precision_stderr": 0.003079312427902136, "rougeLsum_recall": 0.3313519491919322, "rougeLsum_recall_stderr": 0.002639246783373194}}, "2": {"generate_text_restaurant": {"bleu": 12.803942144601116, "bleu_stderr": 0.21618040146131992, "rouge1_fmeasure": 0.4611513176254791, "rouge1_fmeasure_stderr": 0.002199995367365474, "rouge1_precision": 0.5538907219965342, "rouge1_precision_stderr": 0.0032612294164257216, "rouge1_recall": 0.4350995198760696, "rouge1_recall_stderr": 0.002856610829509688, "rouge2_fmeasure": 0.2229687549240271, "rouge2_fmeasure_stderr": 0.0019710523597241715, "rouge2_precision": 0.2729320100739506, "rouge2_precision_stderr": 0.0027234503513028584, "rouge2_recall": 0.2101696343227206, "rouge2_recall_stderr": 0.002150285923735445, "rougeL_fmeasure": 0.3416818930898736, "rougeL_fmeasure_stderr": 0.0020246870873676555, "rougeL_precision": 0.4133741500218145, "rougeL_precision_stderr": 0.003020936241100034, "rougeL_recall": 0.32145908492422065, "rougeL_recall_stderr": 0.0023969661475136306, "rougeLsum_fmeasure": 0.38257616765715247, "rougeLsum_fmeasure_stderr": 0.0022246263334132453, "rougeLsum_precision": 0.4611079827648602, "rougeLsum_precision_stderr": 0.0032149008542871746, "rougeLsum_recall": 0.36026749805953084, "rougeLsum_recall_stderr": 0.002648080757717859}}, "3": {"generate_text_restaurant": {"bleu": 13.549419526938319, "bleu_stderr": 0.1855065278827089, "rouge1_fmeasure": 0.47156037863176303, "rouge1_fmeasure_stderr": 0.0021582661393463962, 
"rouge1_precision": 0.5622765325560521, "rouge1_precision_stderr": 0.0032287138496162934, "rouge1_recall": 0.44435519723608424, "rouge1_recall_stderr": 0.002801797047585696, "rouge2_fmeasure": 0.23320849920258752, "rouge2_fmeasure_stderr": 0.002003019136182861, "rouge2_precision": 0.2824005913951133, "rouge2_precision_stderr": 0.002719274951851043, "rouge2_recall": 0.21987553093336418, "rouge2_recall_stderr": 0.002184214297734334, "rougeL_fmeasure": 0.35119690753799093, "rougeL_fmeasure_stderr": 0.002078338911409397, "rougeL_precision": 0.42050656145422816, "rougeL_precision_stderr": 0.0030059607815769765, "rougeL_recall": 0.330473027799321, "rougeL_recall_stderr": 0.0024402629244160393, "rougeLsum_fmeasure": 0.394476669084247, "rougeLsum_fmeasure_stderr": 0.0022641630943918186, "rougeLsum_precision": 0.4706616514354793, "rougeLsum_precision_stderr": 0.003192105043156649, "rougeLsum_recall": 0.37155323262077417, "rougeLsum_recall_stderr": 0.0026779306873817056}}, "4": {"generate_text_restaurant": {"bleu": 13.824091400241928, "bleu_stderr": 0.13104091370071627, "rouge1_fmeasure": 0.4729227008920987, "rouge1_fmeasure_stderr": 0.002173390089402513, "rouge1_precision": 0.559655152378256, "rouge1_precision_stderr": 0.00321686393289185, "rouge1_recall": 0.4463184732215886, "rouge1_recall_stderr": 0.002756272749754986, "rouge2_fmeasure": 0.23586940604720674, "rouge2_fmeasure_stderr": 0.0020025387806477473, "rouge2_precision": 0.2840232577174272, "rouge2_precision_stderr": 0.00271181738495325, "rouge2_recall": 0.2222491185021198, "rouge2_recall_stderr": 0.0021623586075651146, "rougeL_fmeasure": 0.35202027805933056, "rougeL_fmeasure_stderr": 0.002092564908854137, "rougeL_precision": 0.4180629283082632, "rougeL_precision_stderr": 0.0029877481411599237, "rougeL_recall": 0.33196107712218637, "rougeL_recall_stderr": 0.0024311143031782462, "rougeLsum_fmeasure": 0.39613612180447344, "rougeLsum_fmeasure_stderr": 0.002266152079884005, "rougeLsum_precision": 0.4688463943976415, "rougeLsum_precision_stderr": 0.0031530760765321546, "rougeLsum_recall": 0.37409621129154014, "rougeLsum_recall_stderr": 0.0026785522822089635}}, "5": {"generate_text_restaurant": {"bleu": 14.200380998337229, "bleu_stderr": 0.19329384114815742, "rouge1_fmeasure": 0.47679187260022104, "rouge1_fmeasure_stderr": 0.002154333673818338, "rouge1_precision": 0.5598013517043776, "rouge1_precision_stderr": 0.0031952209265306724, "rouge1_recall": 0.4515552018520872, "rouge1_recall_stderr": 0.0027541134409545752, "rouge2_fmeasure": 0.23981788779086907, "rouge2_fmeasure_stderr": 0.0020141420899880426, "rouge2_precision": 0.2868996527085169, "rouge2_precision_stderr": 0.0027439839572370225, "rouge2_recall": 0.22658843899426176, "rouge2_recall_stderr": 0.002166189730967118, "rougeL_fmeasure": 0.3553964547201032, "rougeL_fmeasure_stderr": 0.0020748384854769424, "rougeL_precision": 0.4188961211054761, "rougeL_precision_stderr": 0.0029753067970391253, "rougeL_recall": 0.33630944752712066, "rougeL_recall_stderr": 0.002420426022345789, "rougeLsum_fmeasure": 0.40177021275924996, "rougeLsum_fmeasure_stderr": 0.0022697719817145177, "rougeLsum_precision": 0.47218722229605625, "rougeLsum_precision_stderr": 0.003172746693556059, "rougeLsum_recall": 0.3804288121576126, "rougeLsum_recall_stderr": 0.0026714741710499577}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.0960641205542827, "bleu_stderr": 0.06798641040244616, "rouge1_fmeasure": 0.20476726427909248, "rouge1_fmeasure_stderr": 0.002649474133903418, "rouge1_precision": 0.15842344794238317, 
"rouge1_precision_stderr": 0.002372031267709981, "rouge1_recall": 0.3284952182299337, "rouge1_recall_stderr": 0.0044298556614964756, "rouge2_fmeasure": 0.046916069305408335, "rouge2_fmeasure_stderr": 0.0017197355481914094, "rouge2_precision": 0.03540871760128505, "rouge2_precision_stderr": 0.0014181321683695847, "rouge2_recall": 0.07816166775968982, "rouge2_recall_stderr": 0.0028237456627397046, "rougeL_fmeasure": 0.15749123183599076, "rougeL_fmeasure_stderr": 0.002073944694954327, "rougeL_precision": 0.12176908432033119, "rougeL_precision_stderr": 0.0018710373562779625, "rougeL_recall": 0.2541701545006927, "rougeL_recall_stderr": 0.003551902432183393, "rougeLsum_fmeasure": 0.16236856321055648, "rougeLsum_fmeasure_stderr": 0.002283002778052774, "rougeLsum_precision": 0.12532539040842194, "rougeLsum_precision_stderr": 0.0019917639332936807, "rougeLsum_recall": 0.26232198349291647, "rougeLsum_recall_stderr": 0.003932145235326188}}, "1": {"article_DOC_summary": {"bleu": 1.4696760073708208, "bleu_stderr": 0.06265270441243473, "rouge1_fmeasure": 0.17891974543666367, "rouge1_fmeasure_stderr": 0.0024706510302881872, "rouge1_precision": 0.1271937006393597, "rouge1_precision_stderr": 0.0018275474060210689, "rouge1_recall": 0.3135496885733734, "rouge1_recall_stderr": 0.004242917655669163, "rouge2_fmeasure": 0.03736288266942292, "rouge2_fmeasure_stderr": 0.0014220403164735326, "rouge2_precision": 0.026267000271180424, "rouge2_precision_stderr": 0.0010006504874047221, "rouge2_recall": 0.06722812716641016, "rouge2_recall_stderr": 0.002624780376748357, "rougeL_fmeasure": 0.14017165769151607, "rougeL_fmeasure_stderr": 0.0018633298526103481, "rougeL_precision": 0.09950647531372532, "rougeL_precision_stderr": 0.001366245247346246, "rougeL_recall": 0.24678392130953006, "rougeL_recall_stderr": 0.0033174561386390567, "rougeLsum_fmeasure": 0.1428074680374857, "rougeLsum_fmeasure_stderr": 0.002053207964484963, "rougeLsum_precision": 0.1012924090593187, "rougeLsum_precision_stderr": 0.0014963465244654786, "rougeLsum_recall": 0.2518765845772237, "rougeLsum_recall_stderr": 0.0036614448966502135}}, "2": {"article_DOC_summary": {"bleu": 1.5439003664247115, "bleu_stderr": 0.10851669022401786, "rouge1_fmeasure": 0.18113319848969633, "rouge1_fmeasure_stderr": 0.0024823291925392898, "rouge1_precision": 0.12914677410683195, "rouge1_precision_stderr": 0.001846681090538485, "rouge1_recall": 0.31563927705067746, "rouge1_recall_stderr": 0.004211671574972399, "rouge2_fmeasure": 0.039942143589958415, "rouge2_fmeasure_stderr": 0.0014636503235895269, "rouge2_precision": 0.028246308499238097, "rouge2_precision_stderr": 0.001039777781254331, "rouge2_recall": 0.07096061694974337, "rouge2_recall_stderr": 0.0026636364744364984, "rougeL_fmeasure": 0.1456603788587842, "rougeL_fmeasure_stderr": 0.0019174980062219666, "rougeL_precision": 0.10371573983765109, "rougeL_precision_stderr": 0.0014182980426936482, "rougeL_recall": 0.2550751527972976, "rougeL_recall_stderr": 0.00337783485571472, "rougeLsum_fmeasure": 0.14257911934085393, "rougeLsum_fmeasure_stderr": 0.0020437479521562105, "rougeLsum_precision": 0.10139018090786547, "rougeLsum_precision_stderr": 0.0014984047207110595, "rougeLsum_recall": 0.25022303799687573, "rougeLsum_recall_stderr": 0.0035989374972674425}}, "3": {"article_DOC_summary": {"bleu": 1.6145690136458937, "bleu_stderr": 0.08805787248504932, "rouge1_fmeasure": 0.17680602744097074, "rouge1_fmeasure_stderr": 0.002631980246229425, "rouge1_precision": 0.1296338781704996, "rouge1_precision_stderr": 0.0021646982503126387, 
"rouge1_recall": 0.3021070098062078, "rouge1_recall_stderr": 0.004505088335872746, "rouge2_fmeasure": 0.03904911931370361, "rouge2_fmeasure_stderr": 0.0014830525218956757, "rouge2_precision": 0.028309377178479218, "rouge2_precision_stderr": 0.0011164748559469404, "rouge2_recall": 0.06851024845447631, "rouge2_recall_stderr": 0.0026775539108549653, "rougeL_fmeasure": 0.14303319341608928, "rougeL_fmeasure_stderr": 0.0021220160049417853, "rougeL_precision": 0.10464417400111554, "rougeL_precision_stderr": 0.0017426786877401284, "rougeL_recall": 0.2455380323796214, "rougeL_recall_stderr": 0.003722345532650257, "rougeLsum_fmeasure": 0.13901962932381862, "rougeLsum_fmeasure_stderr": 0.002140626268901333, "rougeLsum_precision": 0.10170185371430301, "rougeLsum_precision_stderr": 0.001760814642264659, "rougeLsum_recall": 0.23909447163984915, "rougeLsum_recall_stderr": 0.0037550549904903216}}, "4": {"article_DOC_summary": {"bleu": 0.8494731376465096, "bleu_stderr": 0.15759913012237653, "rouge1_fmeasure": 0.049265793004777556, "rouge1_fmeasure_stderr": 0.0027540940646298335, "rouge1_precision": 0.044098357705247684, "rouge1_precision_stderr": 0.002947936975335653, "rouge1_recall": 0.07606084813555343, "rouge1_recall_stderr": 0.004337933086452851, "rouge2_fmeasure": 0.010722887449732417, "rouge2_fmeasure_stderr": 0.0009552037220813737, "rouge2_precision": 0.010194511397550707, "rouge2_precision_stderr": 0.0014805264810943113, "rouge2_recall": 0.01713730518568552, "rouge2_recall_stderr": 0.0015368918618672114, "rougeL_fmeasure": 0.039633473182401346, "rougeL_fmeasure_stderr": 0.0022076800838754865, "rougeL_precision": 0.036289652305464506, "rougeL_precision_stderr": 0.002595205368225138, "rougeL_recall": 0.06125010427741984, "rougeL_recall_stderr": 0.0035011808908862285, "rougeLsum_fmeasure": 0.039426934015668255, "rougeLsum_fmeasure_stderr": 0.002220814881451182, "rougeLsum_precision": 0.036220881484227876, "rougeLsum_precision_stderr": 0.0026089772559001233, "rougeLsum_recall": 0.06097365769946667, "rougeLsum_recall_stderr": 0.0035434576624889095}}, "5": {"article_DOC_summary": {"bleu": 4.572852757384441e-37, "bleu_stderr": 8.34242498549966e-32, "rouge1_fmeasure": 0.003121651270370141, "rouge1_fmeasure_stderr": 0.000904884711931905, "rouge1_precision": 0.0034796910548998868, "rouge1_precision_stderr": 0.0010310566863858106, "rouge1_recall": 0.003032491272466143, "rouge1_recall_stderr": 0.0008992364877230313, "rouge2_fmeasure": 0.0005993250623691804, "rouge2_fmeasure_stderr": 0.0002662375423658333, "rouge2_precision": 0.0006845648493817196, "rouge2_precision_stderr": 0.00029577448238137655, "rouge2_recall": 0.0005517517074120848, "rouge2_recall_stderr": 0.000253583231847424, "rougeL_fmeasure": 0.0023439628096121445, "rougeL_fmeasure_stderr": 0.0007012918571879311, "rougeL_precision": 0.002635557548227681, "rougeL_precision_stderr": 0.0008071763549733777, "rougeL_recall": 0.0022333458685090783, "rougeL_recall_stderr": 0.0006677775933364694, "rougeLsum_fmeasure": 0.002638144025655543, "rougeLsum_fmeasure_stderr": 0.0007785517533749153, "rougeLsum_precision": 0.002932835738490274, "rougeLsum_precision_stderr": 0.000884369578355545, "rougeLsum_recall": 0.0025885022906791634, "rougeLsum_recall_stderr": 0.0007930485746239223}}}} \ No newline at end of file diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_0.csv b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..61c85f66993dbe4233d3ae6309ac93dfc9052039 --- 
/dev/null +++ b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.337,0.014955087918653603,0 +anli_r2,acc,0.343,0.015019206922356953,0 +anli_r3,acc,0.3425,0.013704669762934728,0 +arc_challenge,acc,0.29436860068259385,0.013318528460539422,0 +arc_challenge,acc_norm,0.3046075085324232,0.01344952210993249,0 +arc_easy,acc,0.6102693602693603,0.010007169391797053,0 +arc_easy,acc_norm,0.5315656565656566,0.010239317603199509,0 +boolq,acc,0.5525993883792049,0.008696530539281539,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.19555555555555557,,1 +copa,acc,0.78,0.04163331998932261,0 +hellaswag,acc,0.47868950408285205,0.00498524726030409,0 +hellaswag,acc_norm,0.6269667396932882,0.004826224784850451,0 +piqa,acc,0.7578890097932536,0.009994371269104387,0 +piqa,acc_norm,0.7611534276387377,0.009948120385337484,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.851,0.011266140684632168,0 +sciq,acc_norm,0.76,0.013512312258920831,0 +storycloze_2016,acc,0.7177979690005345,0.010407834479647672,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json deleted file mode 100644 index 1f0552cf68acdc405e2b11e3d1fe6c1796d318fa..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_0_lm-eval_global_step80108_2023-02-24-21-45-54_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.337, - "acc_stderr": 0.014955087918653603 - }, - "anli_r2": { - "acc": 0.343, - "acc_stderr": 0.015019206922356953 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934728 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.19555555555555557 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.04163331998932261 - }, - "hellaswag": { - "acc": 0.47868950408285205, - "acc_stderr": 0.00498524726030409, - "acc_norm": 0.6269667396932882, - "acc_norm_stderr": 0.004826224784850451 - }, - "rte": { - "acc": 0.5090252707581228, - "acc_stderr": 0.030091559826331334 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.010407834479647672 - }, - "boolq": { - "acc": 0.5525993883792049, - "acc_stderr": 0.008696530539281539 - }, - "arc_easy": { - "acc": 0.6102693602693603, - "acc_stderr": 0.010007169391797053, - "acc_norm": 0.5315656565656566, - "acc_norm_stderr": 0.010239317603199509 - }, - "arc_challenge": { - "acc": 0.29436860068259385, - "acc_stderr": 0.013318528460539422, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.01344952210993249 - }, - "sciq": { - "acc": 0.851, - "acc_stderr": 0.011266140684632168, - "acc_norm": 0.76, - "acc_norm_stderr": 0.013512312258920831 - }, - "piqa": { - "acc": 0.7578890097932536, - "acc_stderr": 0.009994371269104387, - "acc_norm": 0.7611534276387377, - "acc_norm_stderr": 0.009948120385337484 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git 
a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_1.csv b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..6827722be905a8b454537cf270730190ef28b682 --- /dev/null +++ b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.317,0.01472167543888022,0 +anli_r2,acc,0.314,0.014683991951087966,0 +anli_r3,acc,0.355,0.013819249004047296,0 +arc_challenge,acc,0.2883959044368601,0.013238394422428173,0 +arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0 +arc_easy,acc,0.6241582491582491,0.00993843637317063,0 +arc_easy,acc_norm,0.5833333333333334,0.010116282977781263,0 +boolq,acc,0.6030581039755352,0.00855727696467513,1 +cb,acc,0.30357142857142855,0.06199938655510754,1 +cb,f1,0.20076628352490422,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.477096195976897,0.004984543540932339,0 +hellaswag,acc_norm,0.6249751045608445,0.0048313992185002475,0 +piqa,acc,0.7595212187159956,0.009971345364651073,0 +piqa,acc_norm,0.750272034820457,0.010099232969867469,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.901,0.00944924802766276,0 +sciq,acc_norm,0.874,0.010499249222408035,0 +storycloze_2016,acc,0.7108498129342598,0.010484068799942077,0 +winogrande,acc,0.6006314127861089,0.013764933546717614,0 diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json deleted file mode 100644 index c7d611b874702999fa36d38f1ff436427e69789c..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.317, - "acc_stderr": 0.01472167543888022 - }, - "anli_r2": { - "acc": 0.314, - "acc_stderr": 0.014683991951087966 - }, - "anli_r3": { - "acc": 0.355, - "acc_stderr": 0.013819249004047296 - }, - "cb": { - "acc": 0.30357142857142855, - "acc_stderr": 0.06199938655510754, - "f1": 0.20076628352490422 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.477096195976897, - "acc_stderr": 0.004984543540932339, - "acc_norm": 0.6249751045608445, - "acc_norm_stderr": 0.0048313992185002475 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.6006314127861089, - "acc_stderr": 0.013764933546717614 - }, - "storycloze_2016": { - "acc": 0.7108498129342598, - "acc_stderr": 0.010484068799942077 - }, - "boolq": { - "acc": 0.6030581039755352, - "acc_stderr": 0.00855727696467513 - }, - "arc_easy": { - "acc": 0.6241582491582491, - "acc_stderr": 0.00993843637317063, - "acc_norm": 0.5833333333333334, - "acc_norm_stderr": 0.010116282977781263 - }, - "arc_challenge": { - "acc": 0.2883959044368601, - "acc_stderr": 0.013238394422428173, - "acc_norm": 0.3165529010238908, - "acc_norm_stderr": 0.01359243151906808 - }, - "sciq": { - "acc": 0.901, - "acc_stderr": 0.00944924802766276, - "acc_norm": 0.874, - "acc_norm_stderr": 0.010499249222408035 - }, - "piqa": { - "acc": 0.7595212187159956, - "acc_stderr": 0.009971345364651073, - "acc_norm": 0.750272034820457, - "acc_norm_stderr": 0.010099232969867469 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - 
"hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_2.csv b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..3125c908f6494930737b630e362b05b34c65be25 --- /dev/null +++ b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.306,0.014580006055436967,0 +anli_r2,acc,0.333,0.014910846164229863,0 +anli_r3,acc,0.31916666666666665,0.013462309712005143,0 +arc_challenge,acc,0.3003412969283277,0.013395909309957,0 +arc_challenge,acc_norm,0.3319112627986348,0.013760988200880536,0 +arc_easy,acc,0.6279461279461279,0.009918187193096471,0 +arc_easy,acc_norm,0.6069023569023569,0.010022540618945315,0 +boolq,acc,0.6165137614678899,0.008504304838837027,1 +cb,acc,0.17857142857142858,0.051642771820087224,1 +cb,f1,0.16652752931822698,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.4774945230033858,0.00498472423511512,0 +hellaswag,acc_norm,0.6274646484763992,0.004824917516374197,0 +piqa,acc,0.7535364526659413,0.01005481078967182,0 +piqa,acc_norm,0.7633297062023939,0.009916841655042809,0 +rte,acc,0.4981949458483754,0.030096267148976633,0 +sciq,acc,0.911,0.009008893392651523,0 +sciq,acc_norm,0.891,0.00985982840703719,0 +storycloze_2016,acc,0.7156600748262961,0.010431614128665244,0 +winogrande,acc,0.6029992107340174,0.013751092519806704,0 diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json deleted file mode 100644 index 050ee91c296173fc8b82687a8422b9903bcee26d..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.306, - "acc_stderr": 0.014580006055436967 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229863 - }, - "anli_r3": { - "acc": 0.31916666666666665, - "acc_stderr": 0.013462309712005143 - }, - "cb": { - "acc": 0.17857142857142858, - "acc_stderr": 0.051642771820087224, - "f1": 0.16652752931822698 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.4774945230033858, - "acc_stderr": 0.00498472423511512, - "acc_norm": 0.6274646484763992, - "acc_norm_stderr": 0.004824917516374197 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976633 - }, - "winogrande": { - "acc": 0.6029992107340174, - "acc_stderr": 0.013751092519806704 - }, - "storycloze_2016": { - "acc": 0.7156600748262961, - "acc_stderr": 0.010431614128665244 - }, - "boolq": { - "acc": 0.6165137614678899, - "acc_stderr": 0.008504304838837027 - }, - "arc_easy": { - "acc": 0.6279461279461279, - "acc_stderr": 0.009918187193096471, - "acc_norm": 0.6069023569023569, - "acc_norm_stderr": 0.010022540618945315 - }, - "arc_challenge": { - "acc": 0.3003412969283277, - "acc_stderr": 0.013395909309957, - "acc_norm": 0.3319112627986348, - "acc_norm_stderr": 0.013760988200880536 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651523, - "acc_norm": 0.891, - "acc_norm_stderr": 0.00985982840703719 - }, - "piqa": { - "acc": 
0.7535364526659413, - "acc_stderr": 0.01005481078967182, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_3.csv b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..18b87f9865de58efe5c332616eaf5f30544152fe --- /dev/null +++ b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.329,0.014865395385928355,0 +anli_r2,acc,0.341,0.014998131348402704,0 +anli_r3,acc,0.325,0.013526454480351028,0 +arc_challenge,acc,0.29948805460750855,0.013385021637313565,0 +arc_challenge,acc_norm,0.33276450511945393,0.01376986304619231,0 +arc_easy,acc,0.6380471380471381,0.009860991466688486,0 +arc_easy,acc_norm,0.625,0.009933992677987828,0 +boolq,acc,0.6146788990825688,0.008511930879680645,1 +cb,acc,0.25,0.058387420812114225,1 +cb,f1,0.24860681114551084,,1 +copa,acc,0.84,0.03684529491774711,0 +hellaswag,acc,0.4788886675960964,0.004985331652408345,0 +hellaswag,acc_norm,0.6285600477992431,0.004822022254886021,0 +piqa,acc,0.7584330794341676,0.009986718001804463,0 +piqa,acc_norm,0.7562568008705114,0.010017199471500609,0 +rte,acc,0.48014440433212996,0.0300727231673172,0 +sciq,acc,0.916,0.008776162089491122,0 +sciq,acc_norm,0.9,0.009491579957525049,0 +storycloze_2016,acc,0.7172634954569749,0.01041380648612127,0 +winogrande,acc,0.590370955011839,0.013821049109655465,0 diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json deleted file mode 100644 index 6ed579572798fc191e1081cee1c87c2a4dcdac31..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.329, - "acc_stderr": 0.014865395385928355 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.014998131348402704 - }, - "anli_r3": { - "acc": 0.325, - "acc_stderr": 0.013526454480351028 - }, - "cb": { - "acc": 0.25, - "acc_stderr": 0.058387420812114225, - "f1": 0.24860681114551084 - }, - "copa": { - "acc": 0.84, - "acc_stderr": 0.03684529491774711 - }, - "hellaswag": { - "acc": 0.4788886675960964, - "acc_stderr": 0.004985331652408345, - "acc_norm": 0.6285600477992431, - "acc_norm_stderr": 0.004822022254886021 - }, - "rte": { - "acc": 0.48014440433212996, - "acc_stderr": 0.0300727231673172 - }, - "winogrande": { - "acc": 0.590370955011839, - "acc_stderr": 0.013821049109655465 - }, - "storycloze_2016": { - "acc": 0.7172634954569749, - "acc_stderr": 0.01041380648612127 - }, - "boolq": { - "acc": 0.6146788990825688, - "acc_stderr": 0.008511930879680645 - }, - "arc_easy": { - "acc": 0.6380471380471381, - "acc_stderr": 0.009860991466688486, - "acc_norm": 0.625, - "acc_norm_stderr": 0.009933992677987828 - }, - "arc_challenge": { - "acc": 0.29948805460750855, - "acc_stderr": 0.013385021637313565, - "acc_norm": 0.33276450511945393, - "acc_norm_stderr": 0.01376986304619231 - }, - "sciq": { - 
"acc": 0.916, - "acc_stderr": 0.008776162089491122, - "acc_norm": 0.9, - "acc_norm_stderr": 0.009491579957525049 - }, - "piqa": { - "acc": 0.7584330794341676, - "acc_stderr": 0.009986718001804463, - "acc_norm": 0.7562568008705114, - "acc_norm_stderr": 0.010017199471500609 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_4.csv b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..b0753b8c1660ba16881ecab407659d8f5c95cbf0 --- /dev/null +++ b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932577,0 +anli_r2,acc,0.337,0.014955087918653609,0 +anli_r3,acc,0.3325,0.013605417345710528,0 +arc_challenge,acc,0.29948805460750855,0.013385021637313563,0 +arc_challenge,acc_norm,0.3361774744027304,0.013804855026205763,0 +arc_easy,acc,0.6380471380471381,0.00986099146668847,0 +arc_easy,acc_norm,0.6216329966329966,0.00995157568333195,0 +boolq,acc,0.6192660550458715,0.008492625561656215,1 +cb,acc,0.23214285714285715,0.056929390240001085,1 +cb,f1,0.23148148148148148,,1 +copa,acc,0.84,0.0368452949177471,0 +hellaswag,acc,0.47470623381796456,0.004983392650570959,0 +hellaswag,acc_norm,0.6319458275243975,0.004812905279066442,0 +piqa,acc,0.7529923830250272,0.010062268140772625,0 +piqa,acc_norm,0.7568008705114254,0.010009611953858917,0 +rte,acc,0.47653429602888087,0.030063300411902652,0 +sciq,acc,0.915,0.008823426366942323,0 +sciq,acc_norm,0.91,0.009054390204866444,0 +storycloze_2016,acc,0.721004810261892,0.010371620932652795,0 +winogrande,acc,0.6108918705603789,0.013702520871485949,0 diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json deleted file mode 100644 index 1ed659ead93c21380463e58b56a5725340bdbbdd..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932577 - }, - "anli_r2": { - "acc": 0.337, - "acc_stderr": 0.014955087918653609 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.013605417345710528 - }, - "cb": { - "acc": 0.23214285714285715, - "acc_stderr": 0.056929390240001085, - "f1": 0.23148148148148148 - }, - "copa": { - "acc": 0.84, - "acc_stderr": 0.0368452949177471 - }, - "hellaswag": { - "acc": 0.47470623381796456, - "acc_stderr": 0.004983392650570959, - "acc_norm": 0.6319458275243975, - "acc_norm_stderr": 0.004812905279066442 - }, - "rte": { - "acc": 0.47653429602888087, - "acc_stderr": 0.030063300411902652 - }, - "winogrande": { - "acc": 0.6108918705603789, - "acc_stderr": 0.013702520871485949 - }, - "storycloze_2016": { - "acc": 0.721004810261892, - "acc_stderr": 0.010371620932652795 - }, - "boolq": { - "acc": 0.6192660550458715, - "acc_stderr": 0.008492625561656215 - }, - "arc_easy": { - "acc": 0.6380471380471381, - "acc_stderr": 0.00986099146668847, - "acc_norm": 0.6216329966329966, - "acc_norm_stderr": 
0.00995157568333195 - }, - "arc_challenge": { - "acc": 0.29948805460750855, - "acc_stderr": 0.013385021637313563, - "acc_norm": 0.3361774744027304, - "acc_norm_stderr": 0.013804855026205763 - }, - "sciq": { - "acc": 0.915, - "acc_stderr": 0.008823426366942323, - "acc_norm": 0.91, - "acc_norm_stderr": 0.009054390204866444 - }, - "piqa": { - "acc": 0.7529923830250272, - "acc_stderr": 0.010062268140772625, - "acc_norm": 0.7568008705114254, - "acc_norm_stderr": 0.010009611953858917 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_5.csv b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..4a2bfc0770e75a9f4acc580047dd7838dee7aa15 --- /dev/null +++ b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.324,0.01480686473373886,0 +anli_r2,acc,0.329,0.014865395385928359,0 +anli_r3,acc,0.3333333333333333,0.01361395001022561,0 +arc_challenge,acc,0.29948805460750855,0.013385021637313565,0 +arc_challenge,acc_norm,0.3267918088737201,0.01370666597558734,0 +arc_easy,acc,0.6384680134680135,0.00985850654316206,0 +arc_easy,acc_norm,0.625,0.009933992677987828,0 +boolq,acc,0.6311926605504588,0.008438656079759072,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.33391833391833387,,1 +copa,acc,0.82,0.03861229196653697,0 +hellaswag,acc,0.4781915952997411,0.004985032806802436,0 +hellaswag,acc_norm,0.6330412268472416,0.004809901151234833,0 +piqa,acc,0.7568008705114254,0.010009611953858917,0 +piqa,acc_norm,0.7589771490750816,0.009979042717267315,0 +rte,acc,0.5379061371841155,0.030009848912529117,0 +sciq,acc,0.913,0.008916866630745906,0 +sciq,acc_norm,0.908,0.0091443763931511,0 +storycloze_2016,acc,0.7295563869588455,0.010271810373331022,0 +winogrande,acc,0.5927387529597474,0.013808654122417845,0 diff --git a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json b/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json deleted file mode 100644 index 1f1459c2cfb492dfda59c7b44d1e462772df3427..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed3/evaluation/rankeval/4b284b42bc4seed3_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.324, - "acc_stderr": 0.01480686473373886 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928359 - }, - "anli_r3": { - "acc": 0.3333333333333333, - "acc_stderr": 0.01361395001022561 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.33391833391833387 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.03861229196653697 - }, - "hellaswag": { - "acc": 0.4781915952997411, - "acc_stderr": 0.004985032806802436, - "acc_norm": 0.6330412268472416, - "acc_norm_stderr": 0.004809901151234833 - }, - "rte": { - "acc": 0.5379061371841155, - "acc_stderr": 0.030009848912529117 - }, - "winogrande": { - "acc": 0.5927387529597474, - "acc_stderr": 0.013808654122417845 - }, - "storycloze_2016": { - "acc": 0.7295563869588455, - "acc_stderr": 0.010271810373331022 - }, - "boolq": { - 
"acc": 0.6311926605504588, - "acc_stderr": 0.008438656079759072 - }, - "arc_easy": { - "acc": 0.6384680134680135, - "acc_stderr": 0.00985850654316206, - "acc_norm": 0.625, - "acc_norm_stderr": 0.009933992677987828 - }, - "arc_challenge": { - "acc": 0.29948805460750855, - "acc_stderr": 0.013385021637313565, - "acc_norm": 0.3267918088737201, - "acc_norm_stderr": 0.01370666597558734 - }, - "sciq": { - "acc": 0.913, - "acc_stderr": 0.008916866630745906, - "acc_norm": 0.908, - "acc_norm_stderr": 0.0091443763931511 - }, - "piqa": { - "acc": 0.7568008705114254, - "acc_stderr": 0.010009611953858917, - "acc_norm": 0.7589771490750816, - "acc_norm_stderr": 0.009979042717267315 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_0.csv b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..989ece609642277bcda8cb97df460b8f115d164b --- /dev/null +++ b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.314,0.01468399195108797,0 +anli_r2,acc,0.332,0.014899597242811478,0 +anli_r3,acc,0.33666666666666667,0.013647602942406401,0 +arc_challenge,acc,0.2696245733788396,0.012968040686869142,0 +arc_challenge,acc_norm,0.30119453924914674,0.013406741767847627,0 +arc_easy,acc,0.6186868686868687,0.009966542497171016,0 +arc_easy,acc_norm,0.5349326599326599,0.010234713052723679,0 +boolq,acc,0.6162079510703364,0.008505584729104973,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.2689474934663815,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.4790878311093408,0.004985415250690911,0 +hellaswag,acc_norm,0.6315475004979088,0.0048139910698082634,0 +piqa,acc,0.7562568008705114,0.010017199471500619,0 +piqa,acc_norm,0.7600652883569097,0.009963625892809545,0 +rte,acc,0.5703971119133574,0.02979666882912467,0 +sciq,acc,0.86,0.010978183844357801,0 +sciq,acc_norm,0.779,0.013127502859696239,0 +storycloze_2016,acc,0.7177979690005345,0.010407834479647673,0 +winogrande,acc,0.5737963693764798,0.013898585965412338,0 diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-04_0shots_backup.json b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-04_0shots_backup.json deleted file mode 100644 index 02adc5b249eac66bfad71d3b5d58ea556a64b902..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_0_lm-eval_global_step80108_2023-02-15-11-04-04_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.314, - "acc_stderr": 0.01468399195108797 - }, - "anli_r2": { - "acc": 0.332, - "acc_stderr": 0.014899597242811478 - }, - "anli_r3": { - "acc": 0.33666666666666667, - "acc_stderr": 0.013647602942406401 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.2689474934663815 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.4790878311093408, - "acc_stderr": 0.004985415250690911, - "acc_norm": 0.6315475004979088, - "acc_norm_stderr": 0.0048139910698082634 - }, - "rte": { - "acc": 0.5703971119133574, - "acc_stderr": 
0.02979666882912467 - }, - "winogrande": { - "acc": 0.5737963693764798, - "acc_stderr": 0.013898585965412338 - }, - "storycloze_2016": { - "acc": 0.7177979690005345, - "acc_stderr": 0.010407834479647673 - }, - "boolq": { - "acc": 0.6162079510703364, - "acc_stderr": 0.008505584729104973 - }, - "arc_easy": { - "acc": 0.6186868686868687, - "acc_stderr": 0.009966542497171016, - "acc_norm": 0.5349326599326599, - "acc_norm_stderr": 0.010234713052723679 - }, - "arc_challenge": { - "acc": 0.2696245733788396, - "acc_stderr": 0.012968040686869142, - "acc_norm": 0.30119453924914674, - "acc_norm_stderr": 0.013406741767847627 - }, - "sciq": { - "acc": 0.86, - "acc_stderr": 0.010978183844357801, - "acc_norm": 0.779, - "acc_norm_stderr": 0.013127502859696239 - }, - "piqa": { - "acc": 0.7562568008705114, - "acc_stderr": 0.010017199471500619, - "acc_norm": 0.7600652883569097, - "acc_norm_stderr": 0.009963625892809545 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_1.csv b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..73390d9e197760b82077e239bd54bf0048e45089 --- /dev/null +++ b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.323,0.014794927843348637,0 +anli_r2,acc,0.325,0.014818724459095527,0 +anli_r3,acc,0.33666666666666667,0.013647602942406389,0 +arc_challenge,acc,0.2713310580204778,0.012993807727545801,0 +arc_challenge,acc_norm,0.3054607508532423,0.01346008047800251,0 +arc_easy,acc,0.6321548821548821,0.009894923464455196,0 +arc_easy,acc_norm,0.5795454545454546,0.010129114278546524,0 +boolq,acc,0.6235474006116208,0.008473882279194588,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.32592592592592595,,1 +copa,acc,0.74,0.04408440022768077,0 +hellaswag,acc,0.4780920135431189,0.004984989320648131,0 +hellaswag,acc_norm,0.6283608842859988,0.004822550638450904,0 +piqa,acc,0.7546245919477693,0.0100398313204224,0 +piqa,acc_norm,0.7622415669205659,0.009932525779525489,0 +rte,acc,0.5487364620938628,0.029953149241808946,0 +sciq,acc,0.905,0.009276910103103326,0 +sciq,acc_norm,0.873,0.010534798620855759,0 +storycloze_2016,acc,0.7135221806520577,0.01045510591863303,0 +winogrande,acc,0.5895816890292028,0.013825107120035863,0 diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-04_1shots_backup.json b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-04_1shots_backup.json deleted file mode 100644 index 33a600498da310639889ae6a0d0921d4a9a5f3b9..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_1_lm-eval_global_step80108_2023-02-15-11-04-04_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.014794927843348637 - }, - "anli_r2": { - "acc": 0.325, - "acc_stderr": 0.014818724459095527 - }, - "anli_r3": { - "acc": 0.33666666666666667, - "acc_stderr": 0.013647602942406389 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.32592592592592595 - }, - "copa": { - "acc": 0.74, - "acc_stderr": 0.04408440022768077 - 
}, - "hellaswag": { - "acc": 0.4780920135431189, - "acc_stderr": 0.004984989320648131, - "acc_norm": 0.6283608842859988, - "acc_norm_stderr": 0.004822550638450904 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808946 - }, - "winogrande": { - "acc": 0.5895816890292028, - "acc_stderr": 0.013825107120035863 - }, - "storycloze_2016": { - "acc": 0.7135221806520577, - "acc_stderr": 0.01045510591863303 - }, - "boolq": { - "acc": 0.6235474006116208, - "acc_stderr": 0.008473882279194588 - }, - "arc_easy": { - "acc": 0.6321548821548821, - "acc_stderr": 0.009894923464455196, - "acc_norm": 0.5795454545454546, - "acc_norm_stderr": 0.010129114278546524 - }, - "arc_challenge": { - "acc": 0.2713310580204778, - "acc_stderr": 0.012993807727545801, - "acc_norm": 0.3054607508532423, - "acc_norm_stderr": 0.01346008047800251 - }, - "sciq": { - "acc": 0.905, - "acc_stderr": 0.009276910103103326, - "acc_norm": 0.873, - "acc_norm_stderr": 0.010534798620855759 - }, - "piqa": { - "acc": 0.7546245919477693, - "acc_stderr": 0.0100398313204224, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525489 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_2.csv b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..ad79e3410eb4f3b64c370f7b3696b2546ed7a245 --- /dev/null +++ b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.327,0.014842213153411249,0 +anli_r2,acc,0.35,0.015090650341444233,0 +anli_r3,acc,0.325,0.013526454480351025,0 +arc_challenge,acc,0.2841296928327645,0.013179442447653886,0 +arc_challenge,acc_norm,0.30802047781569963,0.013491429517292038,0 +arc_easy,acc,0.6372053872053872,0.009865936757013938,0 +arc_easy,acc_norm,0.6077441077441077,0.010018744689650043,0 +boolq,acc,0.6342507645259939,0.00842393006885078,1 +cb,acc,0.4107142857142857,0.06633634150359541,1 +cb,f1,0.26894586894586897,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.4736108344951205,0.0049828269166871525,0 +hellaswag,acc_norm,0.6298546106353317,0.004818566366066934,0 +piqa,acc,0.7611534276387377,0.0099481203853375,0 +piqa,acc_norm,0.7584330794341676,0.009986718001804451,0 +rte,acc,0.5487364620938628,0.029953149241808943,0 +sciq,acc,0.911,0.009008893392651528,0 +sciq,acc_norm,0.885,0.010093407594904638,0 +storycloze_2016,acc,0.7188669160876536,0.010395836091628108,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-04_2shots_backup.json b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-04_2shots_backup.json deleted file mode 100644 index 21a4a1023e4f8a1e765c5e617d5dda124fd4e7de..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_2_lm-eval_global_step80108_2023-02-15-11-04-04_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.327, - "acc_stderr": 0.014842213153411249 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444233 - }, - "anli_r3": { - "acc": 0.325, - "acc_stderr": 
0.013526454480351025 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.06633634150359541, - "f1": 0.26894586894586897 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.4736108344951205, - "acc_stderr": 0.0049828269166871525, - "acc_norm": 0.6298546106353317, - "acc_norm_stderr": 0.004818566366066934 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808943 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7188669160876536, - "acc_stderr": 0.010395836091628108 - }, - "boolq": { - "acc": 0.6342507645259939, - "acc_stderr": 0.00842393006885078 - }, - "arc_easy": { - "acc": 0.6372053872053872, - "acc_stderr": 0.009865936757013938, - "acc_norm": 0.6077441077441077, - "acc_norm_stderr": 0.010018744689650043 - }, - "arc_challenge": { - "acc": 0.2841296928327645, - "acc_stderr": 0.013179442447653886, - "acc_norm": 0.30802047781569963, - "acc_norm_stderr": 0.013491429517292038 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651528, - "acc_norm": 0.885, - "acc_norm_stderr": 0.010093407594904638 - }, - "piqa": { - "acc": 0.7611534276387377, - "acc_stderr": 0.0099481203853375, - "acc_norm": 0.7584330794341676, - "acc_norm_stderr": 0.009986718001804451 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_3.csv b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..0f62aadd5300aff12b262be649afc4ed89bf97e9 --- /dev/null +++ b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.306,0.014580006055436967,0 +anli_r2,acc,0.359,0.01517726422479859,0 +anli_r3,acc,0.35583333333333333,0.01382651874849331,0 +arc_challenge,acc,0.2773037542662116,0.013082095839059376,0 +arc_challenge,acc_norm,0.3174061433447099,0.01360223908803817,0 +arc_easy,acc,0.6430976430976431,0.009830630210347012,0 +arc_easy,acc_norm,0.622895622895623,0.00994504194636652,0 +boolq,acc,0.634862385321101,0.008420941009417815,1 +cb,acc,0.5714285714285714,0.06672848092813058,1 +cb,f1,0.5178689064558629,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.47540330611431986,0.004983740145218613,0 +hellaswag,acc_norm,0.630551682931687,0.004816690123209743,0 +piqa,acc,0.7573449401523396,0.010002002569708698,0 +piqa,acc_norm,0.766050054406964,0.00987723689513744,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.914,0.008870325962594766,0 +sciq,acc_norm,0.906,0.009233052000787733,0 +storycloze_2016,acc,0.7258150721539284,0.010316062787590011,0 +winogrande,acc,0.5919494869771112,0.013812822643745028,0 diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-04_3shots_backup.json b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-04_3shots_backup.json deleted file mode 100644 index f069af8de1fd5ef6b8483e1c05238888c85af2d7..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_3_lm-eval_global_step80108_2023-02-15-11-04-04_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - 
"results": { - "anli_r1": { - "acc": 0.306, - "acc_stderr": 0.014580006055436967 - }, - "anli_r2": { - "acc": 0.359, - "acc_stderr": 0.01517726422479859 - }, - "anli_r3": { - "acc": 0.35583333333333333, - "acc_stderr": 0.01382651874849331 - }, - "cb": { - "acc": 0.5714285714285714, - "acc_stderr": 0.06672848092813058, - "f1": 0.5178689064558629 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 0.47540330611431986, - "acc_stderr": 0.004983740145218613, - "acc_norm": 0.630551682931687, - "acc_norm_stderr": 0.004816690123209743 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5919494869771112, - "acc_stderr": 0.013812822643745028 - }, - "storycloze_2016": { - "acc": 0.7258150721539284, - "acc_stderr": 0.010316062787590011 - }, - "boolq": { - "acc": 0.634862385321101, - "acc_stderr": 0.008420941009417815 - }, - "arc_easy": { - "acc": 0.6430976430976431, - "acc_stderr": 0.009830630210347012, - "acc_norm": 0.622895622895623, - "acc_norm_stderr": 0.00994504194636652 - }, - "arc_challenge": { - "acc": 0.2773037542662116, - "acc_stderr": 0.013082095839059376, - "acc_norm": 0.3174061433447099, - "acc_norm_stderr": 0.01360223908803817 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.906, - "acc_norm_stderr": 0.009233052000787733 - }, - "piqa": { - "acc": 0.7573449401523396, - "acc_stderr": 0.010002002569708698, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.00987723689513744 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_4.csv b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..023a5f8e356495dbad907528ba8e4859e5920976 --- /dev/null +++ b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.338,0.014965960710224482,0 +anli_r2,acc,0.379,0.01534909100222535,0 +anli_r3,acc,0.35333333333333333,0.013804572162314937,0 +arc_challenge,acc,0.29180887372013653,0.013284525292403506,0 +arc_challenge,acc_norm,0.3054607508532423,0.013460080478002505,0 +arc_easy,acc,0.6460437710437711,0.009812370644174426,0 +arc_easy,acc_norm,0.6241582491582491,0.009938436373170616,0 +boolq,acc,0.636085626911315,0.008414918909128852,1 +cb,acc,0.44642857142857145,0.06703189227942398,1 +cb,f1,0.2940620782726046,,1 +copa,acc,0.82,0.03861229196653697,0 +hellaswag,acc,0.476000796654053,0.004984030250507291,0 +hellaswag,acc_norm,0.6342362079267079,0.004806593424942258,0 +piqa,acc,0.7589771490750816,0.009979042717267314,0 +piqa,acc_norm,0.7600652883569097,0.009963625892809545,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.918,0.008680515615523725,0 +sciq,acc_norm,0.914,0.008870325962594766,0 +storycloze_2016,acc,0.7284874398717264,0.010284547617192592,0 +winogrande,acc,0.6101026045777427,0.013707547317008463,0 diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-04_4shots_backup.json b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-04_4shots_backup.json deleted file mode 100644 index 
99afb38557c454f663b3ade53438c41e691aa6d4..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_4_lm-eval_global_step80108_2023-02-15-11-04-04_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.338, - "acc_stderr": 0.014965960710224482 - }, - "anli_r2": { - "acc": 0.379, - "acc_stderr": 0.01534909100222535 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314937 - }, - "cb": { - "acc": 0.44642857142857145, - "acc_stderr": 0.06703189227942398, - "f1": 0.2940620782726046 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.03861229196653697 - }, - "hellaswag": { - "acc": 0.476000796654053, - "acc_stderr": 0.004984030250507291, - "acc_norm": 0.6342362079267079, - "acc_norm_stderr": 0.004806593424942258 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.6101026045777427, - "acc_stderr": 0.013707547317008463 - }, - "storycloze_2016": { - "acc": 0.7284874398717264, - "acc_stderr": 0.010284547617192592 - }, - "boolq": { - "acc": 0.636085626911315, - "acc_stderr": 0.008414918909128852 - }, - "arc_easy": { - "acc": 0.6460437710437711, - "acc_stderr": 0.009812370644174426, - "acc_norm": 0.6241582491582491, - "acc_norm_stderr": 0.009938436373170616 - }, - "arc_challenge": { - "acc": 0.29180887372013653, - "acc_stderr": 0.013284525292403506, - "acc_norm": 0.3054607508532423, - "acc_norm_stderr": 0.013460080478002505 - }, - "sciq": { - "acc": 0.918, - "acc_stderr": 0.008680515615523725, - "acc_norm": 0.914, - "acc_norm_stderr": 0.008870325962594766 - }, - "piqa": { - "acc": 0.7589771490750816, - "acc_stderr": 0.009979042717267314, - "acc_norm": 0.7600652883569097, - "acc_norm_stderr": 0.009963625892809545 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_5.csv b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..a40fd6842dc5fff6b88efe614a2709e15ceb092f --- /dev/null +++ b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.354,0.015129868238451772,0 +anli_r2,acc,0.336,0.014944140233795016,0 +anli_r3,acc,0.3475,0.013751753243291854,0 +arc_challenge,acc,0.2909556313993174,0.01327307786590758,0 +arc_challenge,acc_norm,0.3225255972696246,0.01365998089427737,0 +arc_easy,acc,0.648989898989899,0.009793703885101045,0 +arc_easy,acc_norm,0.6199494949494949,0.009960175831493126,0 +boolq,acc,0.6345565749235474,0.008422437370062704,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.2828828828828829,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.4772953594901414,0.004984634285101618,0 +hellaswag,acc_norm,0.6366261700856403,0.00479988224849481,0 +piqa,acc,0.7568008705114254,0.010009611953858922,0 +piqa,acc_norm,0.766050054406964,0.009877236895137437,0 +rte,acc,0.5595667870036101,0.029882123363118726,0 +sciq,acc,0.92,0.008583336977753653,0 +sciq,acc_norm,0.916,0.00877616208949112,0 +storycloze_2016,acc,0.7274184927846071,0.010297209765351286,0 +winogrande,acc,0.6093133385951065,0.013712536036556667,0 diff --git 
a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-04_5shots_backup.json b/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-04_5shots_backup.json deleted file mode 100644 index 07226d9e407f767cbc216086116703688e4cb3b6..0000000000000000000000000000000000000000 --- a/4b284b42bc4seed4/evaluation/rankeval/4b284b42bc4seed4_5_lm-eval_global_step80108_2023-02-15-11-04-04_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.354, - "acc_stderr": 0.015129868238451772 - }, - "anli_r2": { - "acc": 0.336, - "acc_stderr": 0.014944140233795016 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291854 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.2828828828828829 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.4772953594901414, - "acc_stderr": 0.004984634285101618, - "acc_norm": 0.6366261700856403, - "acc_norm_stderr": 0.00479988224849481 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118726 - }, - "winogrande": { - "acc": 0.6093133385951065, - "acc_stderr": 0.013712536036556667 - }, - "storycloze_2016": { - "acc": 0.7274184927846071, - "acc_stderr": 0.010297209765351286 - }, - "boolq": { - "acc": 0.6345565749235474, - "acc_stderr": 0.008422437370062704 - }, - "arc_easy": { - "acc": 0.648989898989899, - "acc_stderr": 0.009793703885101045, - "acc_norm": 0.6199494949494949, - "acc_norm_stderr": 0.009960175831493126 - }, - "arc_challenge": { - "acc": 0.2909556313993174, - "acc_stderr": 0.01327307786590758, - "acc_norm": 0.3225255972696246, - "acc_norm_stderr": 0.01365998089427737 - }, - "sciq": { - "acc": 0.92, - "acc_stderr": 0.008583336977753653, - "acc_norm": 0.916, - "acc_norm_stderr": 0.00877616208949112 - }, - "piqa": { - "acc": 0.7568008705114254, - "acc_stderr": 0.010009611953858922, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.009877236895137437 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/generation/merged.csv b/4b284b84bc4v2seed1/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..ddafac3b8578a24a49667ef747c4abaa6d9eba6a --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0067785024654653135 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0067785024654653135 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.147547294117922 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.147547294117922 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19477780513184206 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.19477780513184206 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.20169349699148847 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.20169349699148847 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.20428378680082976 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.20428378680082976 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.20717938591913995 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.20717938591913995 
+e2e_nlg_cleaned,5,average,multiple,0.16037671190444794 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04496383454282395 +gem_xsum,0,median,rouge2_fmeasure,0.04496383454282395 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.0356226282886433 +gem_xsum,1,median,rouge2_fmeasure,0.0356226282886433 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03905458313366822 +gem_xsum,2,median,rouge2_fmeasure,0.03905458313366822 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03732502337389758 +gem_xsum,3,median,rouge2_fmeasure,0.03732502337389758 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01037190909743005 +gem_xsum,4,median,rouge2_fmeasure,0.01037190909743005 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00023488994329474606 +gem_xsum,5,median,rouge2_fmeasure,0.00023488994329474606 +gem_xsum,5,average,multiple,0.027928811396626306 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05650691632396146 +web_nlg_en,0,median,rouge2_fmeasure,0.05650691632396146 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.06046213786394769 +web_nlg_en,1,median,rouge2_fmeasure,0.06046213786394769 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.06291009585723452 +web_nlg_en,2,median,rouge2_fmeasure,0.06291009585723452 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06493911358433024 +web_nlg_en,3,median,rouge2_fmeasure,0.06493911358433024 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.06350109303343363 +web_nlg_en,4,median,rouge2_fmeasure,0.06350109303343363 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.0628361162345776 +web_nlg_en,5,median,rouge2_fmeasure,0.0628361162345776 +web_nlg_en,5,average,multiple,0.06185924548291419 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03594833738750215 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03594833738750215 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05228896303304648 +wiki_lingua_en,1,median,rouge2_fmeasure,0.05228896303304648 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05758155471699364 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05758155471699364 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04621392992407233 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04621392992407233 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01447488452837446 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01447488452837446 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002282481160760156 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002282481160760156 +wiki_lingua_en,5,average,multiple,0.0347983584584582 diff --git a/4b284b84bc4v2seed1/evaluation/generation/merged.json b/4b284b84bc4v2seed1/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..b2ed28583017a429daa23472176370553d394870 --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.34326729829531105, "bleu_stderr": 0.03358637261998785, "rouge1_fmeasure": 0.11916431891109731, "rouge1_fmeasure_stderr": 0.0021175179570425535, "rouge1_precision": 0.07862630340742671, "rouge1_precision_stderr": 0.0016136694824123174, "rouge1_recall": 0.32369911891592423, "rouge1_recall_stderr": 0.00451248293494817, "rouge2_fmeasure": 0.05650691632396146, "rouge2_fmeasure_stderr": 0.0013525890239055648, "rouge2_precision": 0.037063617321827565, "rouge2_precision_stderr": 0.0009944922058352445, "rouge2_recall": 0.1576672341633246, "rouge2_recall_stderr": 0.003224711999438015, "rougeL_fmeasure": 0.11433859915415578, "rougeL_fmeasure_stderr": 0.0019492218338629818, "rougeL_precision": 0.07513715808830057, "rougeL_precision_stderr": 0.0014618460125218682, "rougeL_recall": 
0.3132333887860453, "rougeL_recall_stderr": 0.004386685214813446, "rougeLsum_fmeasure": 0.11352608023289806, "rougeLsum_fmeasure_stderr": 0.0019780381016784067, "rougeLsum_precision": 0.07483474027103894, "rougeLsum_precision_stderr": 0.0015052225414928463, "rougeLsum_recall": 0.30940914625977445, "rougeLsum_recall_stderr": 0.004284176854457974}}, "1": {"PALM_prompt": {"bleu": 0.5948862237229019, "bleu_stderr": 0.039582663632953784, "rouge1_fmeasure": 0.12765607517625405, "rouge1_fmeasure_stderr": 0.0018849201553444409, "rouge1_precision": 0.0816587124699168, "rouge1_precision_stderr": 0.0014085940124898722, "rouge1_recall": 0.41376102556798416, "rouge1_recall_stderr": 0.005120285313342278, "rouge2_fmeasure": 0.06046213786394769, "rouge2_fmeasure_stderr": 0.0012051323603965336, "rouge2_precision": 0.03849526777850954, "rouge2_precision_stderr": 0.0008702558598728688, "rouge2_recall": 0.2085100408577, "rouge2_recall_stderr": 0.0038038005581152424, "rougeL_fmeasure": 0.1201138678185169, "rougeL_fmeasure_stderr": 0.001700210672201901, "rougeL_precision": 0.07670875557418717, "rougeL_precision_stderr": 0.0012508227867979015, "rougeL_recall": 0.3877812200148867, "rougeL_recall_stderr": 0.004701589867320948, "rougeLsum_fmeasure": 0.1213070354914201, "rougeLsum_fmeasure_stderr": 0.0017782747477851136, "rougeLsum_precision": 0.07767059791046184, "rougeLsum_precision_stderr": 0.0013306591210432749, "rougeLsum_recall": 0.3916530889765753, "rougeLsum_recall_stderr": 0.004748463468558481}}, "2": {"PALM_prompt": {"bleu": 0.6537228269018129, "bleu_stderr": 0.035990435892814046, "rouge1_fmeasure": 0.13201614272291048, "rouge1_fmeasure_stderr": 0.0018780895794891634, "rouge1_precision": 0.08384544140823398, "rouge1_precision_stderr": 0.001386560526866323, "rouge1_recall": 0.4331359104245019, "rouge1_recall_stderr": 0.005220604608607641, "rouge2_fmeasure": 0.06291009585723452, "rouge2_fmeasure_stderr": 0.0012038722951742996, "rouge2_precision": 0.03970512762200282, "rouge2_precision_stderr": 0.0008537385186468243, "rouge2_recall": 0.22235505391512328, "rouge2_recall_stderr": 0.0039417962741365555, "rougeL_fmeasure": 0.1228295838153615, "rougeL_fmeasure_stderr": 0.0016541288388338322, "rougeL_precision": 0.07794807354372117, "rougeL_precision_stderr": 0.0012143425476458327, "rougeL_recall": 0.40229620579661823, "rougeL_recall_stderr": 0.00469923128612557, "rougeLsum_fmeasure": 0.12485060584424827, "rougeLsum_fmeasure_stderr": 0.0017432213357476526, "rougeLsum_precision": 0.07935198772750388, "rougeLsum_precision_stderr": 0.0012897989845438466, "rougeLsum_recall": 0.40869894505460913, "rougeLsum_recall_stderr": 0.004807761564841007}}, "3": {"PALM_prompt": {"bleu": 0.7688161900114325, "bleu_stderr": 0.042616105589279835, "rouge1_fmeasure": 0.13565333331245233, "rouge1_fmeasure_stderr": 0.0019214001825423423, "rouge1_precision": 0.08581431850654406, "rouge1_precision_stderr": 0.0014091207858822378, "rouge1_recall": 0.451020251123445, "rouge1_recall_stderr": 0.005322257735569716, "rouge2_fmeasure": 0.06493911358433024, "rouge2_fmeasure_stderr": 0.0012189671754640097, "rouge2_precision": 0.04079595838580081, "rouge2_precision_stderr": 0.0008598531728946593, "rouge2_recall": 0.2315610050442737, "rouge2_recall_stderr": 0.003940553650230959, "rougeL_fmeasure": 0.12512904904228123, "rougeL_fmeasure_stderr": 0.0016792174221316413, "rougeL_precision": 0.07911531057680408, "rougeL_precision_stderr": 0.0012306486411544932, "rougeL_recall": 0.41550199843423724, "rougeL_recall_stderr": 0.0046820104549203595, 
"rougeLsum_fmeasure": 0.12810389486368845, "rougeLsum_fmeasure_stderr": 0.0017841996987519588, "rougeLsum_precision": 0.08110613355742896, "rougeLsum_precision_stderr": 0.0013144098266559656, "rougeLsum_recall": 0.42560098030356347, "rougeLsum_recall_stderr": 0.004883459517511784}}, "4": {"PALM_prompt": {"bleu": 0.7570960984336995, "bleu_stderr": 0.045096655250586745, "rouge1_fmeasure": 0.13513090909445435, "rouge1_fmeasure_stderr": 0.001857462116626778, "rouge1_precision": 0.08529632027255797, "rouge1_precision_stderr": 0.0013615215584273299, "rouge1_recall": 0.4473685435747539, "rouge1_recall_stderr": 0.005108952082458071, "rouge2_fmeasure": 0.06350109303343363, "rouge2_fmeasure_stderr": 0.0011816726391683833, "rouge2_precision": 0.039815339309114045, "rouge2_precision_stderr": 0.0008282825483537941, "rouge2_recall": 0.22633120487799224, "rouge2_recall_stderr": 0.003860666223150301, "rougeL_fmeasure": 0.12393163259884432, "rougeL_fmeasure_stderr": 0.0016143790828061306, "rougeL_precision": 0.07817291391555115, "rougeL_precision_stderr": 0.0011817095376910824, "rougeL_recall": 0.4102750537372601, "rougeL_recall_stderr": 0.0045365995201896055, "rougeLsum_fmeasure": 0.12775964589335537, "rougeLsum_fmeasure_stderr": 0.0017354753598928022, "rougeLsum_precision": 0.08068706565512851, "rougeLsum_precision_stderr": 0.0012755251012248334, "rougeLsum_recall": 0.42255633147412197, "rougeLsum_recall_stderr": 0.004729478013075473}}, "5": {"PALM_prompt": {"bleu": 0.8104376401030898, "bleu_stderr": 0.037763333643428175, "rouge1_fmeasure": 0.13404599680810675, "rouge1_fmeasure_stderr": 0.0018055699183364886, "rouge1_precision": 0.0838447069828142, "rouge1_precision_stderr": 0.0012980600976676352, "rouge1_recall": 0.4625986883261239, "rouge1_recall_stderr": 0.005264921888949228, "rouge2_fmeasure": 0.0628361162345776, "rouge2_fmeasure_stderr": 0.0011500189815248805, "rouge2_precision": 0.0390357371719713, "rouge2_precision_stderr": 0.0007974267332581108, "rouge2_recall": 0.234619890150141, "rouge2_recall_stderr": 0.003959729264087865, "rougeL_fmeasure": 0.12166806010708248, "rougeL_fmeasure_stderr": 0.0015507413942357185, "rougeL_precision": 0.07615211553464588, "rougeL_precision_stderr": 0.0011261242462889827, "rougeL_recall": 0.41996017345442693, "rougeL_recall_stderr": 0.004602021801069344, "rougeLsum_fmeasure": 0.12626669858481684, "rougeLsum_fmeasure_stderr": 0.0016876984680813949, "rougeLsum_precision": 0.07909555784969943, "rougeLsum_precision_stderr": 0.001223667316440974, "rougeLsum_recall": 0.4344646979883619, "rougeLsum_recall_stderr": 0.0048111289087792546}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4851646334072905, "bleu_stderr": 0.04861340229236376, "rouge1_fmeasure": 0.17730225364977723, "rouge1_fmeasure_stderr": 0.0018050005614776118, "rouge1_precision": 0.15124630792509844, "rouge1_precision_stderr": 0.0018485365387243876, "rouge1_recall": 0.25785648347728835, "rouge1_recall_stderr": 0.002611440833308902, "rouge2_fmeasure": 0.03594833738750215, "rouge2_fmeasure_stderr": 0.0008314088155865108, "rouge2_precision": 0.030400917702662122, "rouge2_precision_stderr": 0.0007326144175326317, "rouge2_recall": 0.05411845535920983, "rouge2_recall_stderr": 0.0014089873929772655, "rougeL_fmeasure": 0.13868093485641456, "rougeL_fmeasure_stderr": 0.0012767260559905826, "rougeL_precision": 0.11672273037859081, "rougeL_precision_stderr": 0.0012669062046531912, "rougeL_recall": 0.20676211608950493, "rougeL_recall_stderr": 0.0021096802792339516, "rougeLsum_fmeasure": 0.16297678262753268, 
"rougeLsum_fmeasure_stderr": 0.0016388774476537506, "rougeLsum_precision": 0.1387409124133778, "rougeLsum_precision_stderr": 0.0016751685771755596, "rougeLsum_recall": 0.23796507894850802, "rougeLsum_recall_stderr": 0.002413535139954069}}, "1": {"tldr_en": {"bleu": 2.7113252535334103, "bleu_stderr": 0.07184250573639654, "rouge1_fmeasure": 0.2195978679253935, "rouge1_fmeasure_stderr": 0.0019521444699384023, "rouge1_precision": 0.19128048453334281, "rouge1_precision_stderr": 0.002178202778422233, "rouge1_recall": 0.3171923252436733, "rouge1_recall_stderr": 0.0028849834599709145, "rouge2_fmeasure": 0.05228896303304648, "rouge2_fmeasure_stderr": 0.001024500871599731, "rouge2_precision": 0.04579081400229089, "rouge2_precision_stderr": 0.0010575290889807657, "rouge2_recall": 0.07844442051330008, "rouge2_recall_stderr": 0.0017115727659030122, "rougeL_fmeasure": 0.15226535966158725, "rougeL_fmeasure_stderr": 0.0012923274097485509, "rougeL_precision": 0.13178912444678498, "rougeL_precision_stderr": 0.0014864696909304795, "rougeL_recall": 0.22531583868456084, "rougeL_recall_stderr": 0.0022069889213878597, "rougeLsum_fmeasure": 0.2057425849050316, "rougeLsum_fmeasure_stderr": 0.0018338933366344808, "rougeLsum_precision": 0.1791478113719481, "rougeLsum_precision_stderr": 0.0020496336269992233, "rougeLsum_recall": 0.2976216303928859, "rougeLsum_recall_stderr": 0.0027278126831615543}}, "2": {"tldr_en": {"bleu": 3.195118342048628, "bleu_stderr": 0.05475267759048033, "rouge1_fmeasure": 0.22513952082684566, "rouge1_fmeasure_stderr": 0.001916989265395324, "rouge1_precision": 0.1995166451908506, "rouge1_precision_stderr": 0.002256566772556673, "rouge1_recall": 0.32105181579367875, "rouge1_recall_stderr": 0.0028249389877146396, "rouge2_fmeasure": 0.05758155471699364, "rouge2_fmeasure_stderr": 0.0010823368692275683, "rouge2_precision": 0.050978179343173766, "rouge2_precision_stderr": 0.0010942220674854295, "rouge2_recall": 0.08450909984099246, "rouge2_recall_stderr": 0.0017722984734009522, "rougeL_fmeasure": 0.1591666469131682, "rougeL_fmeasure_stderr": 0.0013335400035532849, "rougeL_precision": 0.14026719913503174, "rougeL_precision_stderr": 0.0015950912413380067, "rougeL_recall": 0.23225597459093478, "rougeL_recall_stderr": 0.002279473403645433, "rougeLsum_fmeasure": 0.2126977466698511, "rougeLsum_fmeasure_stderr": 0.00181125814483511, "rougeLsum_precision": 0.18848478692273632, "rougeLsum_precision_stderr": 0.00214414652384963, "rougeLsum_recall": 0.30377969664006194, "rougeLsum_recall_stderr": 0.002697958294911981}}, "3": {"tldr_en": {"bleu": 3.035494832499772, "bleu_stderr": 0.06850441353492724, "rouge1_fmeasure": 0.18313598363551006, "rouge1_fmeasure_stderr": 0.0022391709882410133, "rouge1_precision": 0.1667135139080436, "rouge1_precision_stderr": 0.0024669533830303063, "rouge1_recall": 0.2623617730133377, "rouge1_recall_stderr": 0.003385513168989511, "rouge2_fmeasure": 0.04621392992407233, "rouge2_fmeasure_stderr": 0.001006234346753026, "rouge2_precision": 0.041384258326349704, "rouge2_precision_stderr": 0.0010427841570344117, "rouge2_recall": 0.06930703055665066, "rouge2_recall_stderr": 0.0017169032971841199, "rougeL_fmeasure": 0.13118134843222834, "rougeL_fmeasure_stderr": 0.001566466552562795, "rougeL_precision": 0.11961231876550879, "rougeL_precision_stderr": 0.0018282047181587178, "rougeL_recall": 0.19211636205683727, "rougeL_recall_stderr": 0.002621657231489954, "rougeLsum_fmeasure": 0.1726637198123351, "rougeLsum_fmeasure_stderr": 0.0021129044901164544, "rougeLsum_precision": 0.15711061007841756, 
"rougeLsum_precision_stderr": 0.002336109911551883, "rougeLsum_recall": 0.24764717051147508, "rougeLsum_recall_stderr": 0.0032128254482269142}}, "4": {"tldr_en": {"bleu": 0.6879634967509892, "bleu_stderr": 0.05472536305592775, "rouge1_fmeasure": 0.058553277982514064, "rouge1_fmeasure_stderr": 0.0019738129408339356, "rouge1_precision": 0.05444587983053853, "rouge1_precision_stderr": 0.0020227050857669765, "rouge1_recall": 0.08825152123419766, "rouge1_recall_stderr": 0.0030681469979664377, "rouge2_fmeasure": 0.01447488452837446, "rouge2_fmeasure_stderr": 0.0006938077925061432, "rouge2_precision": 0.012524269642898123, "rouge2_precision_stderr": 0.000686849807944642, "rouge2_recall": 0.02397487658441744, "rouge2_recall_stderr": 0.0012949404748263044, "rougeL_fmeasure": 0.04341997796831018, "rougeL_fmeasure_stderr": 0.0014430563064785457, "rougeL_precision": 0.040637488345939506, "rougeL_precision_stderr": 0.0015470785896341036, "rougeL_recall": 0.06699885504411017, "rougeL_recall_stderr": 0.0023845584550771827, "rougeLsum_fmeasure": 0.054647276227056785, "rougeLsum_fmeasure_stderr": 0.0018406668038190854, "rougeLsum_precision": 0.0508937923018914, "rougeLsum_precision_stderr": 0.0018965643869497761, "rougeLsum_recall": 0.08258034628183224, "rougeLsum_recall_stderr": 0.0028860425739382135}}, "5": {"tldr_en": {"bleu": 1.6290219613024457e-06, "bleu_stderr": 3.2871456226947453e-06, "rouge1_fmeasure": 0.008923710103600203, "rouge1_fmeasure_stderr": 0.0008383612981602042, "rouge1_precision": 0.008704906428698754, "rouge1_precision_stderr": 0.0008920441235603163, "rouge1_recall": 0.01349584479571889, "rouge1_recall_stderr": 0.0013418727621830133, "rouge2_fmeasure": 0.002282481160760156, "rouge2_fmeasure_stderr": 0.0002837675400767184, "rouge2_precision": 0.002278426982281537, "rouge2_precision_stderr": 0.0003265555907877622, "rouge2_recall": 0.003782541091838121, "rouge2_recall_stderr": 0.0005718078404621888, "rougeL_fmeasure": 0.006749717467180042, "rougeL_fmeasure_stderr": 0.0006335344100807781, "rougeL_precision": 0.006657870575038381, "rougeL_precision_stderr": 0.0007002581534225452, "rougeL_recall": 0.010336845889931697, "rougeL_recall_stderr": 0.0010559957378692645, "rougeLsum_fmeasure": 0.008348399663837202, "rougeLsum_fmeasure_stderr": 0.0007881788366882943, "rougeLsum_precision": 0.008122714973299269, "rougeLsum_precision_stderr": 0.0008382953768487845, "rougeLsum_recall": 0.012766583726132735, "rougeLsum_recall_stderr": 0.0012894453008292465}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.019485516617194824, "bleu_stderr": 0.005217238506779678, "rouge1_fmeasure": 0.12227880995996511, "rouge1_fmeasure_stderr": 0.001227579323062598, "rouge1_precision": 0.30475647058823735, "rouge1_precision_stderr": 0.0029113904574003266, "rouge1_recall": 0.0783105046716935, "rouge1_recall_stderr": 0.0008568947755657402, "rouge2_fmeasure": 0.0067785024654653135, "rouge2_fmeasure_stderr": 0.00037041481990128376, "rouge2_precision": 0.0214605442176872, "rouge2_precision_stderr": 0.0011284720045316163, "rouge2_recall": 0.004184760226845158, "rouge2_recall_stderr": 0.00024513519350090904, "rougeL_fmeasure": 0.11241456856374404, "rougeL_fmeasure_stderr": 0.001162992552966645, "rougeL_precision": 0.28038819551008926, "rougeL_precision_stderr": 0.0027737242398049983, "rougeL_recall": 0.07195050572819697, "rougeL_recall_stderr": 0.0008044984809771059, "rougeLsum_fmeasure": 0.11434829952527656, "rougeLsum_fmeasure_stderr": 0.0011699821874966887, "rougeLsum_precision": 0.2858334811025873, 
"rougeLsum_precision_stderr": 0.002820718681739719, "rougeLsum_recall": 0.07313529073834715, "rougeLsum_recall_stderr": 0.0008069332308087195}}, "1": {"generate_text_restaurant": {"bleu": 6.1217745777182575, "bleu_stderr": 0.08609952568756107, "rouge1_fmeasure": 0.3495706737183144, "rouge1_fmeasure_stderr": 0.002206663575855576, "rouge1_precision": 0.30339575240317485, "rouge1_precision_stderr": 0.0024133358053892258, "rouge1_recall": 0.4538469080638178, "rouge1_recall_stderr": 0.0028284562621381255, "rouge2_fmeasure": 0.147547294117922, "rouge2_fmeasure_stderr": 0.0014406862441090546, "rouge2_precision": 0.12753416214535832, "rouge2_precision_stderr": 0.0014369458200472643, "rouge2_recall": 0.19457563475831974, "rouge2_recall_stderr": 0.001944145902755319, "rougeL_fmeasure": 0.2452399085427064, "rougeL_fmeasure_stderr": 0.0015529724243686358, "rougeL_precision": 0.2115837416620355, "rougeL_precision_stderr": 0.0016691313965173627, "rougeL_recall": 0.3224869043888079, "rougeL_recall_stderr": 0.0022669452560961922, "rougeLsum_fmeasure": 0.2815539466454405, "rougeLsum_fmeasure_stderr": 0.0020220538608784486, "rougeLsum_precision": 0.24537972591509996, "rougeLsum_precision_stderr": 0.0021741485514156335, "rougeLsum_recall": 0.36390833549974066, "rougeLsum_recall_stderr": 0.0025504695530939943}}, "2": {"generate_text_restaurant": {"bleu": 9.73396081313577, "bleu_stderr": 0.16443895463999308, "rouge1_fmeasure": 0.4347642350301138, "rouge1_fmeasure_stderr": 0.0018294368910635097, "rouge1_precision": 0.42721544875834094, "rouge1_precision_stderr": 0.0022260460097216607, "rouge1_recall": 0.4783644219119815, "rouge1_recall_stderr": 0.0026415240795754575, "rouge2_fmeasure": 0.19477780513184206, "rouge2_fmeasure_stderr": 0.0015230278683889692, "rouge2_precision": 0.1908864712741689, "rouge2_precision_stderr": 0.0016066847355046183, "rouge2_recall": 0.21664624624813425, "rouge2_recall_stderr": 0.0019501314703353825, "rougeL_fmeasure": 0.2843186127016924, "rougeL_fmeasure_stderr": 0.0015850879360423396, "rougeL_precision": 0.2795606159041614, "rougeL_precision_stderr": 0.0018063552605777058, "rougeL_recall": 0.3138203115628376, "rougeL_recall_stderr": 0.0021930939607276148, "rougeLsum_fmeasure": 0.3486631372436859, "rougeLsum_fmeasure_stderr": 0.0018433564125180965, "rougeLsum_precision": 0.34337399673326013, "rougeLsum_precision_stderr": 0.0021406676222103798, "rougeLsum_recall": 0.38322808261829044, "rougeLsum_recall_stderr": 0.0024691669826182147}}, "3": {"generate_text_restaurant": {"bleu": 10.167196723473115, "bleu_stderr": 0.15513189947572925, "rouge1_fmeasure": 0.4411529394994754, "rouge1_fmeasure_stderr": 0.0017983716092168191, "rouge1_precision": 0.43176656379865663, "rouge1_precision_stderr": 0.002170517687370395, "rouge1_recall": 0.48579069977293127, "rouge1_recall_stderr": 0.0026286758820643257, "rouge2_fmeasure": 0.20169349699148847, "rouge2_fmeasure_stderr": 0.0015425455575345005, "rouge2_precision": 0.19684990744521816, "rouge2_precision_stderr": 0.0016075773671781232, "rouge2_recall": 0.22463280727271592, "rouge2_recall_stderr": 0.001992585772028349, "rougeL_fmeasure": 0.2855873952746071, "rougeL_fmeasure_stderr": 0.0016093104735564181, "rougeL_precision": 0.27934212774008493, "rougeL_precision_stderr": 0.0017756405594887574, "rougeL_recall": 0.3156535136223044, "rougeL_recall_stderr": 0.0022241009812821446, "rougeLsum_fmeasure": 0.3549268514496913, "rougeLsum_fmeasure_stderr": 0.0018598173357349634, "rougeLsum_precision": 0.34791134873666746, "rougeLsum_precision_stderr": 
0.002115584511773362, "rougeLsum_recall": 0.39032744461240204, "rougeLsum_recall_stderr": 0.002480003023653809}}, "4": {"generate_text_restaurant": {"bleu": 10.274846271832308, "bleu_stderr": 0.17350922889251333, "rouge1_fmeasure": 0.4439018507991703, "rouge1_fmeasure_stderr": 0.0018178168545300278, "rouge1_precision": 0.43432693579703685, "rouge1_precision_stderr": 0.0022023391400501123, "rouge1_recall": 0.48816549378150415, "rouge1_recall_stderr": 0.0026054616364898048, "rouge2_fmeasure": 0.20428378680082976, "rouge2_fmeasure_stderr": 0.0015629130875785963, "rouge2_precision": 0.1997182814210848, "rouge2_precision_stderr": 0.0016866546655237163, "rouge2_recall": 0.22703283947909766, "rouge2_recall_stderr": 0.0019763146011800813, "rougeL_fmeasure": 0.28569394577784957, "rougeL_fmeasure_stderr": 0.0015871438924867972, "rougeL_precision": 0.27937569927134076, "rougeL_precision_stderr": 0.0017835624375802472, "rougeL_recall": 0.3155942964310971, "rougeL_recall_stderr": 0.002192145278497691, "rougeLsum_fmeasure": 0.3567726567471316, "rougeLsum_fmeasure_stderr": 0.0018537019301754461, "rougeLsum_precision": 0.34978440910488817, "rougeLsum_precision_stderr": 0.002136407600334958, "rougeLsum_recall": 0.39186145386005805, "rougeLsum_recall_stderr": 0.0024479650146275824}}, "5": {"generate_text_restaurant": {"bleu": 10.565355740750254, "bleu_stderr": 0.1961364903745894, "rouge1_fmeasure": 0.446874499715884, "rouge1_fmeasure_stderr": 0.001817903581134802, "rouge1_precision": 0.4373138164566645, "rouge1_precision_stderr": 0.0022014766665088845, "rouge1_recall": 0.4901583618959144, "rouge1_recall_stderr": 0.0026018676905927864, "rouge2_fmeasure": 0.20717938591913995, "rouge2_fmeasure_stderr": 0.0015958612952873166, "rouge2_precision": 0.2023686501248562, "rouge2_precision_stderr": 0.0016863287107579494, "rouge2_recall": 0.22966424302105778, "rouge2_recall_stderr": 0.0020208732170635497, "rougeL_fmeasure": 0.2891686579274761, "rougeL_fmeasure_stderr": 0.0016149144474544455, "rougeL_precision": 0.28245426485520925, "rougeL_precision_stderr": 0.0017702500351139657, "rougeL_recall": 0.318919953216165, "rougeL_recall_stderr": 0.0022528135483797333, "rougeLsum_fmeasure": 0.3605130360109026, "rougeLsum_fmeasure_stderr": 0.001901178308997283, "rougeLsum_precision": 0.3532483133538902, "rougeLsum_precision_stderr": 0.002156803938589329, "rougeLsum_recall": 0.3952346715593857, "rougeLsum_recall_stderr": 0.0025157250309490783}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9104162079268645, "bleu_stderr": 0.10745870999086636, "rouge1_fmeasure": 0.2058965935490713, "rouge1_fmeasure_stderr": 0.002562767814599106, "rouge1_precision": 0.15469570847748534, "rouge1_precision_stderr": 0.0022274674043606772, "rouge1_recall": 0.34097755921779777, "rouge1_recall_stderr": 0.004359102304158973, "rouge2_fmeasure": 0.04496383454282395, "rouge2_fmeasure_stderr": 0.001558330026208433, "rouge2_precision": 0.03317453563192352, "rouge2_precision_stderr": 0.0012237431942751298, "rouge2_recall": 0.07790936705927562, "rouge2_recall_stderr": 0.0027781247018954214, "rougeL_fmeasure": 0.1557970094622366, "rougeL_fmeasure_stderr": 0.0019230041807038062, "rougeL_precision": 0.1168534912712974, "rougeL_precision_stderr": 0.001681530452546301, "rougeL_recall": 0.26001944579526626, "rougeL_recall_stderr": 0.0034416951705863856, "rougeLsum_fmeasure": 0.16222120620816466, "rougeLsum_fmeasure_stderr": 0.0021566858128678804, "rougeLsum_precision": 0.12147708257791358, "rougeLsum_precision_stderr": 0.0018137808321746112, 
"rougeLsum_recall": 0.2708664092088707, "rougeLsum_recall_stderr": 0.003852079388912951}}, "1": {"article_DOC_summary": {"bleu": 1.5040790159066222, "bleu_stderr": 0.09999582617414535, "rouge1_fmeasure": 0.17696559633240494, "rouge1_fmeasure_stderr": 0.0025027575913053445, "rouge1_precision": 0.12570346189174908, "rouge1_precision_stderr": 0.001849614481854689, "rouge1_recall": 0.3111321160565978, "rouge1_recall_stderr": 0.00433405758194808, "rouge2_fmeasure": 0.0356226282886433, "rouge2_fmeasure_stderr": 0.0014552201793596074, "rouge2_precision": 0.025013191562735113, "rouge2_precision_stderr": 0.0010212805241063957, "rouge2_recall": 0.06433749506053808, "rouge2_recall_stderr": 0.0026958973939770112, "rougeL_fmeasure": 0.13897740891589383, "rougeL_fmeasure_stderr": 0.0018840553688145314, "rougeL_precision": 0.09850554078756936, "rougeL_precision_stderr": 0.0013772963054560117, "rougeL_recall": 0.2460276605249957, "rougeL_recall_stderr": 0.003415396525439171, "rougeLsum_fmeasure": 0.14313077367333898, "rougeLsum_fmeasure_stderr": 0.0020950894669794803, "rougeLsum_precision": 0.10141296524752817, "rougeLsum_precision_stderr": 0.0015247810679113252, "rougeLsum_recall": 0.25333331203094245, "rougeLsum_recall_stderr": 0.0037545439637255164}}, "2": {"article_DOC_summary": {"bleu": 1.4929194953290439, "bleu_stderr": 0.08057665423426058, "rouge1_fmeasure": 0.18127274141871888, "rouge1_fmeasure_stderr": 0.00241848797788742, "rouge1_precision": 0.12896293817065393, "rouge1_precision_stderr": 0.001799675080007091, "rouge1_recall": 0.3180959042115626, "rouge1_recall_stderr": 0.004178674908785601, "rouge2_fmeasure": 0.03905458313366822, "rouge2_fmeasure_stderr": 0.0014152906892153957, "rouge2_precision": 0.02742389012914643, "rouge2_precision_stderr": 0.0009928132907657912, "rouge2_recall": 0.07079016379966595, "rouge2_recall_stderr": 0.0026589413773835647, "rougeL_fmeasure": 0.14417274860624746, "rougeL_fmeasure_stderr": 0.0018339933534688294, "rougeL_precision": 0.10236042398062786, "rougeL_precision_stderr": 0.0013484239278273018, "rougeL_recall": 0.2547109497424645, "rougeL_recall_stderr": 0.0033496507772406404, "rougeLsum_fmeasure": 0.14304634120868107, "rougeLsum_fmeasure_stderr": 0.0020090434539940354, "rougeLsum_precision": 0.10149736681249724, "rougeLsum_precision_stderr": 0.0014695078170962588, "rougeLsum_recall": 0.25287427887724734, "rougeLsum_recall_stderr": 0.003615112148066647}}, "3": {"article_DOC_summary": {"bleu": 1.5514732915686686, "bleu_stderr": 0.06584730744603277, "rouge1_fmeasure": 0.1732012657461723, "rouge1_fmeasure_stderr": 0.002583285350713421, "rouge1_precision": 0.12568971932050957, "rouge1_precision_stderr": 0.0019867343449795436, "rouge1_recall": 0.2980568565006806, "rouge1_recall_stderr": 0.0044385971143545925, "rouge2_fmeasure": 0.03732502337389758, "rouge2_fmeasure_stderr": 0.001407382216335427, "rouge2_precision": 0.026556107570647483, "rouge2_precision_stderr": 0.0010036761182530207, "rouge2_recall": 0.0665297919516232, "rouge2_recall_stderr": 0.002602163761890605, "rougeL_fmeasure": 0.13966111034449794, "rougeL_fmeasure_stderr": 0.0020183371393927246, "rougeL_precision": 0.10107040460617824, "rougeL_precision_stderr": 0.0015300738666534682, "rougeL_recall": 0.24182686627170014, "rougeL_recall_stderr": 0.0036053184465759066, "rougeLsum_fmeasure": 0.13757244542943212, "rougeLsum_fmeasure_stderr": 0.0021289239270338984, "rougeLsum_precision": 0.09953672926413436, "rougeLsum_precision_stderr": 0.001608703406903824, "rougeLsum_recall": 0.23846406990945382, 
"rougeLsum_recall_stderr": 0.0037909803483724158}}, "4": {"article_DOC_summary": {"bleu": 0.7944776696870298, "bleu_stderr": 0.10362442799667403, "rouge1_fmeasure": 0.047969047807786644, "rouge1_fmeasure_stderr": 0.0027049274485574266, "rouge1_precision": 0.040899178032418326, "rouge1_precision_stderr": 0.002605612820038021, "rouge1_recall": 0.07499542166510798, "rouge1_recall_stderr": 0.00428999990716937, "rouge2_fmeasure": 0.01037190909743005, "rouge2_fmeasure_stderr": 0.0009089595677707768, "rouge2_precision": 0.008749245771103371, "rouge2_precision_stderr": 0.0011185040793792623, "rouge2_recall": 0.016994972361551015, "rouge2_recall_stderr": 0.0015279675367425043, "rougeL_fmeasure": 0.03879900170082555, "rougeL_fmeasure_stderr": 0.002173436106104496, "rougeL_precision": 0.03315527531080365, "rougeL_precision_stderr": 0.0021547069470982173, "rougeL_recall": 0.06122545633154963, "rougeL_recall_stderr": 0.0035225934764212467, "rougeLsum_fmeasure": 0.03853554689365841, "rougeLsum_fmeasure_stderr": 0.0022037664641477543, "rougeLsum_precision": 0.033173334214871286, "rougeLsum_precision_stderr": 0.002212477212763894, "rougeLsum_recall": 0.06052111080135351, "rougeLsum_recall_stderr": 0.0035439956690194005}}, "5": {"article_DOC_summary": {"bleu": 3.4941927902195367e-37, "bleu_stderr": 2.0537455298693754e-31, "rouge1_fmeasure": 0.002113857458322312, "rouge1_fmeasure_stderr": 0.0005894640261231909, "rouge1_precision": 0.002322978708500643, "rouge1_precision_stderr": 0.0006607374218254165, "rouge1_recall": 0.002090745619722155, "rouge1_recall_stderr": 0.0006058929846745543, "rouge2_fmeasure": 0.00023488994329474606, "rouge2_fmeasure_stderr": 0.0001655661745735824, "rouge2_precision": 0.00024103409231045192, "rouge2_precision_stderr": 0.0001588035816140835, "rouge2_recall": 0.00023789103977783224, "rouge2_recall_stderr": 0.0001778063536507226, "rougeL_fmeasure": 0.0016968098081768105, "rougeL_fmeasure_stderr": 0.00047227159050400433, "rougeL_precision": 0.0018378127785073283, "rougeL_precision_stderr": 0.0005100015743444681, "rougeL_recall": 0.0017163875500295284, "rougeL_recall_stderr": 0.0005184570317872023, "rougeLsum_fmeasure": 0.0017722881417479145, "rougeLsum_fmeasure_stderr": 0.0004927412393347486, "rougeLsum_precision": 0.001930597627995256, "rougeLsum_precision_stderr": 0.0005384988094450386, "rougeLsum_recall": 0.0017802664167711148, "rougeLsum_recall_stderr": 0.0005319571788963744}}}} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_0.csv b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..579e38cf9e15b6b894d02625a5c2592b1ce01a27 --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.323,0.014794927843348635,0 +anli_r2,acc,0.342,0.015008706182121728,0 +anli_r3,acc,0.3516666666666667,0.013789711695404785,0 +arc_challenge,acc,0.2781569965870307,0.013094469919538805,0 +arc_challenge,acc_norm,0.2909556313993174,0.013273077865907586,0 +arc_easy,acc,0.6098484848484849,0.010009118166667412,0 +arc_easy,acc_norm,0.531986531986532,0.010238767643185714,0 +boolq,acc,0.5299694189602446,0.008729331818314893,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.3082010582010582,,1 +copa,acc,0.81,0.03942772444036623,0 +hellaswag,acc,0.48157737502489545,0.004986393266269166,0 +hellaswag,acc_norm,0.6338378809002191,0.004807699539973428,0 +piqa,acc,0.7546245919477693,0.010039831320422396,0 
+piqa,acc_norm,0.766050054406964,0.00987723689513744,0 +rte,acc,0.5342960288808665,0.030025579819366426,0 +sciq,acc,0.874,0.010499249222408032,0 +sciq,acc_norm,0.772,0.013273740700804471,0 +storycloze_2016,acc,0.7215392838054516,0.010365521460604413,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_0_lm-eval_global_step80108_2023-02-24-21-45-58_0shots_backup.json b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_0_lm-eval_global_step80108_2023-02-24-21-45-58_0shots_backup.json deleted file mode 100644 index cba1aa7ad5b982a45fa2127ac29e96320136c6a0..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_0_lm-eval_global_step80108_2023-02-24-21-45-58_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.014794927843348635 - }, - "anli_r2": { - "acc": 0.342, - "acc_stderr": 0.015008706182121728 - }, - "anli_r3": { - "acc": 0.3516666666666667, - "acc_stderr": 0.013789711695404785 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.3082010582010582 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.48157737502489545, - "acc_stderr": 0.004986393266269166, - "acc_norm": 0.6338378809002191, - "acc_norm_stderr": 0.004807699539973428 - }, - "rte": { - "acc": 0.5342960288808665, - "acc_stderr": 0.030025579819366426 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7215392838054516, - "acc_stderr": 0.010365521460604413 - }, - "boolq": { - "acc": 0.5299694189602446, - "acc_stderr": 0.008729331818314893 - }, - "arc_easy": { - "acc": 0.6098484848484849, - "acc_stderr": 0.010009118166667412, - "acc_norm": 0.531986531986532, - "acc_norm_stderr": 0.010238767643185714 - }, - "arc_challenge": { - "acc": 0.2781569965870307, - "acc_stderr": 0.013094469919538805, - "acc_norm": 0.2909556313993174, - "acc_norm_stderr": 0.013273077865907586 - }, - "sciq": { - "acc": 0.874, - "acc_stderr": 0.010499249222408032, - "acc_norm": 0.772, - "acc_norm_stderr": 0.013273740700804471 - }, - "piqa": { - "acc": 0.7546245919477693, - "acc_stderr": 0.010039831320422396, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.00987723689513744 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_1.csv b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..c729ad02ba03f3defa870b3991349c09c405050b --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.337,0.014955087918653591,0 +anli_r2,acc,0.324,0.014806864733738854,0 +anli_r3,acc,0.3416666666666667,0.013696658778002515,0 +arc_challenge,acc,0.28668941979522183,0.01321498632927479,0 +arc_challenge,acc_norm,0.31399317406143346,0.013562691224726298,0 +arc_easy,acc,0.6123737373737373,0.009997307914447612,0 +arc_easy,acc_norm,0.5778619528619529,0.01013462052459227,0 +boolq,acc,0.5889908256880734,0.008605429733982185,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 
+cb,f1,0.3235294117647059,,1 +copa,acc,0.78,0.041633319989322626,0 +hellaswag,acc,0.47849034056960765,0.004985162074336111,0 +hellaswag,acc_norm,0.6335391356303525,0.0048085268027185865,0 +piqa,acc,0.7540805223068553,0.010047331865625191,0 +piqa,acc_norm,0.7573449401523396,0.01000200256970869,0 +rte,acc,0.5523465703971119,0.029931070362939533,0 +sciq,acc,0.898,0.009575368801653885,0 +sciq,acc_norm,0.89,0.009899393819724442,0 +storycloze_2016,acc,0.7188669160876536,0.010395836091628105,0 +winogrande,acc,0.585635359116022,0.013844846232268563,0 diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json deleted file mode 100644 index 724b51f510919f09cc352b25524106a4fee95378..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_1_lm-eval_global_step80108_2023-02-24-21-45-54_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.337, - "acc_stderr": 0.014955087918653591 - }, - "anli_r2": { - "acc": 0.324, - "acc_stderr": 0.014806864733738854 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002515 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.3235294117647059 - }, - "copa": { - "acc": 0.78, - "acc_stderr": 0.041633319989322626 - }, - "hellaswag": { - "acc": 0.47849034056960765, - "acc_stderr": 0.004985162074336111, - "acc_norm": 0.6335391356303525, - "acc_norm_stderr": 0.0048085268027185865 - }, - "rte": { - "acc": 0.5523465703971119, - "acc_stderr": 0.029931070362939533 - }, - "winogrande": { - "acc": 0.585635359116022, - "acc_stderr": 0.013844846232268563 - }, - "storycloze_2016": { - "acc": 0.7188669160876536, - "acc_stderr": 0.010395836091628105 - }, - "boolq": { - "acc": 0.5889908256880734, - "acc_stderr": 0.008605429733982185 - }, - "arc_easy": { - "acc": 0.6123737373737373, - "acc_stderr": 0.009997307914447612, - "acc_norm": 0.5778619528619529, - "acc_norm_stderr": 0.01013462052459227 - }, - "arc_challenge": { - "acc": 0.28668941979522183, - "acc_stderr": 0.01321498632927479, - "acc_norm": 0.31399317406143346, - "acc_norm_stderr": 0.013562691224726298 - }, - "sciq": { - "acc": 0.898, - "acc_stderr": 0.009575368801653885, - "acc_norm": 0.89, - "acc_norm_stderr": 0.009899393819724442 - }, - "piqa": { - "acc": 0.7540805223068553, - "acc_stderr": 0.010047331865625191, - "acc_norm": 0.7573449401523396, - "acc_norm_stderr": 0.01000200256970869 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_2.csv b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..e6ac442c53298b86bb7c56161deea15105e1ddc3 --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.33,0.01487687202745673,0 +anli_r2,acc,0.323,0.014794927843348635,0 +anli_r3,acc,0.32916666666666666,0.013570806258433635,0 +arc_challenge,acc,0.2901023890784983,0.013261573677520764,0 
+arc_challenge,acc_norm,0.31143344709897613,0.013532472099850945,0 +arc_easy,acc,0.6212121212121212,0.009953737656542035,0 +arc_easy,acc_norm,0.6069023569023569,0.010022540618945315,0 +boolq,acc,0.6097859327217126,0.00853164352626324,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.33962264150943394,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.47759410476000796,0.004984768912326931,0 +hellaswag,acc_norm,0.6341366261700856,0.004806870285747299,0 +piqa,acc,0.7480957562568009,0.010128421335088683,0 +piqa,acc_norm,0.7693144722524483,0.009828959550983096,0 +rte,acc,0.51985559566787,0.030072723167317184,0 +sciq,acc,0.916,0.008776162089491122,0 +sciq,acc_norm,0.899,0.009533618929340988,0 +storycloze_2016,acc,0.72367717797969,0.010340939873166824,0 +winogrande,acc,0.5887924230465666,0.01382912835867687,0 diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json deleted file mode 100644 index 94b3bb570177d9b9d6318464d032e4e6dcfbce4c..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_2_lm-eval_global_step80108_2023-02-24-21-45-54_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.33, - "acc_stderr": 0.01487687202745673 - }, - "anli_r2": { - "acc": 0.323, - "acc_stderr": 0.014794927843348635 - }, - "anli_r3": { - "acc": 0.32916666666666666, - "acc_stderr": 0.013570806258433635 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.33962264150943394 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.47759410476000796, - "acc_stderr": 0.004984768912326931, - "acc_norm": 0.6341366261700856, - "acc_norm_stderr": 0.004806870285747299 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317184 - }, - "winogrande": { - "acc": 0.5887924230465666, - "acc_stderr": 0.01382912835867687 - }, - "storycloze_2016": { - "acc": 0.72367717797969, - "acc_stderr": 0.010340939873166824 - }, - "boolq": { - "acc": 0.6097859327217126, - "acc_stderr": 0.00853164352626324 - }, - "arc_easy": { - "acc": 0.6212121212121212, - "acc_stderr": 0.009953737656542035, - "acc_norm": 0.6069023569023569, - "acc_norm_stderr": 0.010022540618945315 - }, - "arc_challenge": { - "acc": 0.2901023890784983, - "acc_stderr": 0.013261573677520764, - "acc_norm": 0.31143344709897613, - "acc_norm_stderr": 0.013532472099850945 - }, - "sciq": { - "acc": 0.916, - "acc_stderr": 0.008776162089491122, - "acc_norm": 0.899, - "acc_norm_stderr": 0.009533618929340988 - }, - "piqa": { - "acc": 0.7480957562568009, - "acc_stderr": 0.010128421335088683, - "acc_norm": 0.7693144722524483, - "acc_norm_stderr": 0.009828959550983096 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_3.csv b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..42a2c8d9a61ace1b5a8d5ab371d18ae6091bb0c6 --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_3.csv @@ 
-0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.307,0.014593284892852634,0 +anli_r2,acc,0.329,0.014865395385928366,0 +anli_r3,acc,0.3616666666666667,0.013876131663123877,0 +arc_challenge,acc,0.2790102389078498,0.01310678488360133,0 +arc_challenge,acc_norm,0.2986348122866894,0.013374078615068745,0 +arc_easy,acc,0.6212121212121212,0.009953737656542037,0 +arc_easy,acc_norm,0.6077441077441077,0.010018744689650043,0 +boolq,acc,0.6128440366972477,0.008519429207594416,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.28585166486356534,,1 +copa,acc,0.83,0.037752516806863715,0 +hellaswag,acc,0.4790878311093408,0.0049854152506909125,0 +hellaswag,acc_norm,0.6355307707627963,0.004802974070507194,0 +piqa,acc,0.7524483133841132,0.010069703966857102,0 +piqa,acc_norm,0.764961915125136,0.00989314668880531,0 +rte,acc,0.5234657039711191,0.03006330041190266,0 +sciq,acc,0.916,0.008776162089491115,0 +sciq,acc_norm,0.905,0.009276910103103298,0 +storycloze_2016,acc,0.72367717797969,0.010340939873166824,0 +winogrande,acc,0.5872138910812944,0.01383706064868209,0 diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json deleted file mode 100644 index 247ec8e20118a7e1bd45401fd6e02b3762c7f533..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_3_lm-eval_global_step80108_2023-02-24-21-45-54_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.307, - "acc_stderr": 0.014593284892852634 - }, - "anli_r2": { - "acc": 0.329, - "acc_stderr": 0.014865395385928366 - }, - "anli_r3": { - "acc": 0.3616666666666667, - "acc_stderr": 0.013876131663123877 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.28585166486356534 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.037752516806863715 - }, - "hellaswag": { - "acc": 0.4790878311093408, - "acc_stderr": 0.0049854152506909125, - "acc_norm": 0.6355307707627963, - "acc_norm_stderr": 0.004802974070507194 - }, - "rte": { - "acc": 0.5234657039711191, - "acc_stderr": 0.03006330041190266 - }, - "winogrande": { - "acc": 0.5872138910812944, - "acc_stderr": 0.01383706064868209 - }, - "storycloze_2016": { - "acc": 0.72367717797969, - "acc_stderr": 0.010340939873166824 - }, - "boolq": { - "acc": 0.6128440366972477, - "acc_stderr": 0.008519429207594416 - }, - "arc_easy": { - "acc": 0.6212121212121212, - "acc_stderr": 0.009953737656542037, - "acc_norm": 0.6077441077441077, - "acc_norm_stderr": 0.010018744689650043 - }, - "arc_challenge": { - "acc": 0.2790102389078498, - "acc_stderr": 0.01310678488360133, - "acc_norm": 0.2986348122866894, - "acc_norm_stderr": 0.013374078615068745 - }, - "sciq": { - "acc": 0.916, - "acc_stderr": 0.008776162089491115, - "acc_norm": 0.905, - "acc_norm_stderr": 0.009276910103103298 - }, - "piqa": { - "acc": 0.7524483133841132, - "acc_stderr": 0.010069703966857102, - "acc_norm": 0.764961915125136, - "acc_norm_stderr": 0.00989314668880531 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_4.csv 
b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..6aa993749fc1ed51444b8348546829e0d662d826 --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.325,0.014818724459095526,0 +anli_r2,acc,0.347,0.015060472031706618,0 +anli_r3,acc,0.35333333333333333,0.013804572162314926,0 +arc_challenge,acc,0.28498293515358364,0.013191348179838795,0 +arc_challenge,acc_norm,0.3165529010238908,0.01359243151906808,0 +arc_easy,acc,0.6224747474747475,0.00994722783346943,0 +arc_easy,acc_norm,0.6060606060606061,0.010026305355981821,0 +boolq,acc,0.6207951070336392,0.008486012137246288,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.34098360655737703,,1 +copa,acc,0.83,0.037752516806863715,0 +hellaswag,acc,0.4778928500298745,0.004984901752846394,0 +hellaswag,acc_norm,0.6368253335988847,0.004799317209902008,0 +piqa,acc,0.7486398258977149,0.010121156016819257,0 +piqa,acc_norm,0.7633297062023939,0.009916841655042809,0 +rte,acc,0.5126353790613718,0.030086851767188564,0 +sciq,acc,0.925,0.00833333333333337,0 +sciq,acc_norm,0.913,0.008916866630745911,0 +storycloze_2016,acc,0.7311598075895244,0.010252563090396082,0 +winogrande,acc,0.595895816890292,0.013791610664670856,0 diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json deleted file mode 100644 index c12d2343330e602817476424e2219105c0aebd74..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_4_lm-eval_global_step80108_2023-02-24-21-45-54_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.325, - "acc_stderr": 0.014818724459095526 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706618 - }, - "anli_r3": { - "acc": 0.35333333333333333, - "acc_stderr": 0.013804572162314926 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.34098360655737703 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.037752516806863715 - }, - "hellaswag": { - "acc": 0.4778928500298745, - "acc_stderr": 0.004984901752846394, - "acc_norm": 0.6368253335988847, - "acc_norm_stderr": 0.004799317209902008 - }, - "rte": { - "acc": 0.5126353790613718, - "acc_stderr": 0.030086851767188564 - }, - "winogrande": { - "acc": 0.595895816890292, - "acc_stderr": 0.013791610664670856 - }, - "storycloze_2016": { - "acc": 0.7311598075895244, - "acc_stderr": 0.010252563090396082 - }, - "boolq": { - "acc": 0.6207951070336392, - "acc_stderr": 0.008486012137246288 - }, - "arc_easy": { - "acc": 0.6224747474747475, - "acc_stderr": 0.00994722783346943, - "acc_norm": 0.6060606060606061, - "acc_norm_stderr": 0.010026305355981821 - }, - "arc_challenge": { - "acc": 0.28498293515358364, - "acc_stderr": 0.013191348179838795, - "acc_norm": 0.3165529010238908, - "acc_norm_stderr": 0.01359243151906808 - }, - "sciq": { - "acc": 0.925, - "acc_stderr": 0.00833333333333337, - "acc_norm": 0.913, - "acc_norm_stderr": 0.008916866630745911 - }, - "piqa": { - "acc": 0.7486398258977149, - "acc_stderr": 0.010121156016819257, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 
0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_5.csv b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..86e08cdbefb9e55afacbc1837c0800ad20321546 --- /dev/null +++ b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.328,0.01485384248727033,0 +anli_r2,acc,0.334,0.01492201952373296,0 +anli_r3,acc,0.34833333333333333,0.013759437498874077,0 +arc_challenge,acc,0.2901023890784983,0.01326157367752076,0 +arc_challenge,acc_norm,0.310580204778157,0.013522292098053052,0 +arc_easy,acc,0.6262626262626263,0.009927267058259628,0 +arc_easy,acc_norm,0.6174242424242424,0.009972837790531479,0 +boolq,acc,0.6201834862385321,0.008488668235778617,1 +cb,acc,0.5,0.06741998624632421,1 +cb,f1,0.34877192982456134,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.4805815574586736,0.00498601693867853,0 +hellaswag,acc_norm,0.6412069308902609,0.004786660691181924,0 +piqa,acc,0.7480957562568009,0.010128421335088681,0 +piqa,acc_norm,0.7714907508161044,0.00979631351182951,0 +rte,acc,0.5270758122743683,0.030052303463143706,0 +sciq,acc,0.923,0.008434580140240662,0 +sciq,acc_norm,0.915,0.008823426366942314,0 +storycloze_2016,acc,0.7306253340459647,0.010258997754057014,0 +winogrande,acc,0.5895816890292028,0.013825107120035861,0 diff --git a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json b/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json deleted file mode 100644 index e541bb3ff3d05448624f964c8352e0d182fa935f..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed1/evaluation/rankeval/4b284b84bc4v2seed1_5_lm-eval_global_step80108_2023-02-24-21-45-54_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.328, - "acc_stderr": 0.01485384248727033 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.01492201952373296 - }, - "anli_r3": { - "acc": 0.34833333333333333, - "acc_stderr": 0.013759437498874077 - }, - "cb": { - "acc": 0.5, - "acc_stderr": 0.06741998624632421, - "f1": 0.34877192982456134 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 0.4805815574586736, - "acc_stderr": 0.00498601693867853, - "acc_norm": 0.6412069308902609, - "acc_norm_stderr": 0.004786660691181924 - }, - "rte": { - "acc": 0.5270758122743683, - "acc_stderr": 0.030052303463143706 - }, - "winogrande": { - "acc": 0.5895816890292028, - "acc_stderr": 0.013825107120035861 - }, - "storycloze_2016": { - "acc": 0.7306253340459647, - "acc_stderr": 0.010258997754057014 - }, - "boolq": { - "acc": 0.6201834862385321, - "acc_stderr": 0.008488668235778617 - }, - "arc_easy": { - "acc": 0.6262626262626263, - "acc_stderr": 0.009927267058259628, - "acc_norm": 0.6174242424242424, - "acc_norm_stderr": 0.009972837790531479 - }, - "arc_challenge": { - "acc": 0.2901023890784983, - "acc_stderr": 0.01326157367752076, - "acc_norm": 0.310580204778157, - "acc_norm_stderr": 0.013522292098053052 - }, - "sciq": { - "acc": 0.923, - "acc_stderr": 0.008434580140240662, - "acc_norm": 0.915, - "acc_norm_stderr": 0.008823426366942314 - }, - "piqa": { - "acc": 0.7480957562568009, - "acc_stderr": 0.010128421335088681, - 
"acc_norm": 0.7714907508161044, - "acc_norm_stderr": 0.00979631351182951 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/generation/merged.csv b/4b284b84bc4v2seed2/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d6a5accc49baf2d22ce2b06146434a3e5cca83d5 --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.020631873410295806 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.020631873410295806 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.19998119743920914 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.19998119743920914 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23012493273504447 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23012493273504447 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23283631982568367 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23283631982568367 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23219088772710936 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23219088772710936 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2335596273503415 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2335596273503415 +e2e_nlg_cleaned,5,average,multiple,0.19155413974794733 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.05045080374949566 +gem_xsum,0,median,rouge2_fmeasure,0.05045080374949566 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04465858992847766 +gem_xsum,1,median,rouge2_fmeasure,0.04465858992847766 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.046514407795467595 +gem_xsum,2,median,rouge2_fmeasure,0.046514407795467595 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.043981143624385655 +gem_xsum,3,median,rouge2_fmeasure,0.043981143624385655 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01307647569779615 +gem_xsum,4,median,rouge2_fmeasure,0.01307647569779615 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0005678964868499288 +gem_xsum,5,median,rouge2_fmeasure,0.0005678964868499288 +gem_xsum,5,average,multiple,0.033208219547078775 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0554093381893294 +web_nlg_en,0,median,rouge2_fmeasure,0.0554093381893294 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.059755176813484405 +web_nlg_en,1,median,rouge2_fmeasure,0.059755176813484405 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05732608605259917 +web_nlg_en,2,median,rouge2_fmeasure,0.05732608605259917 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.060003612710909315 +web_nlg_en,3,median,rouge2_fmeasure,0.060003612710909315 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.058623697481614896 +web_nlg_en,4,median,rouge2_fmeasure,0.058623697481614896 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05915409533137278 +web_nlg_en,5,median,rouge2_fmeasure,0.05915409533137278 +web_nlg_en,5,average,multiple,0.05837866776321833 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03547451018256482 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03547451018256482 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05894457560206053 +wiki_lingua_en,1,median,rouge2_fmeasure,0.05894457560206053 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.06308783765355935 +wiki_lingua_en,2,median,rouge2_fmeasure,0.06308783765355935 
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.051285956226699604 +wiki_lingua_en,3,median,rouge2_fmeasure,0.051285956226699604 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015912520231271727 +wiki_lingua_en,4,median,rouge2_fmeasure,0.015912520231271727 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002459973367560091 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002459973367560091 +wiki_lingua_en,5,average,multiple,0.03786089554395269 diff --git a/4b284b84bc4v2seed2/evaluation/generation/merged.json b/4b284b84bc4v2seed2/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..cc6181ec2a7332cb91e1cc8d12e9d75b33ed3fd6 --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4561902687825146, "bleu_stderr": 0.045631246828587047, "rouge1_fmeasure": 0.11679844154672582, "rouge1_fmeasure_stderr": 0.0019568915949739994, "rouge1_precision": 0.07625749807589129, "rouge1_precision_stderr": 0.0015146545254109735, "rouge1_recall": 0.3307426476163128, "rouge1_recall_stderr": 0.00456779888031621, "rouge2_fmeasure": 0.0554093381893294, "rouge2_fmeasure_stderr": 0.0012327294395014525, "rouge2_precision": 0.03608151986203413, "rouge2_precision_stderr": 0.0009254470422146193, "rouge2_recall": 0.16229717802554836, "rouge2_recall_stderr": 0.0032223185692842058, "rougeL_fmeasure": 0.11311445429105703, "rougeL_fmeasure_stderr": 0.0018346474985473178, "rougeL_precision": 0.07353494344552397, "rougeL_precision_stderr": 0.0013741278529976802, "rougeL_recall": 0.3230730872507571, "rougeL_recall_stderr": 0.004482961394584532, "rougeLsum_fmeasure": 0.11188332112192871, "rougeLsum_fmeasure_stderr": 0.0018465577738166203, "rougeLsum_precision": 0.07303744482119352, "rougeLsum_precision_stderr": 0.0014379306638165714, "rougeLsum_recall": 0.31705954495780236, "rougeLsum_recall_stderr": 0.004318803443289957}}, "1": {"PALM_prompt": {"bleu": 0.5886516458334025, "bleu_stderr": 0.03717726878870121, "rouge1_fmeasure": 0.12548839335267944, "rouge1_fmeasure_stderr": 0.001928881826547231, "rouge1_precision": 0.08026566053193719, "rouge1_precision_stderr": 0.0014159639116614112, "rouge1_recall": 0.3942157437513017, "rouge1_recall_stderr": 0.005461079688404384, "rouge2_fmeasure": 0.059755176813484405, "rouge2_fmeasure_stderr": 0.0012347079912433854, "rouge2_precision": 0.038000879190334146, "rouge2_precision_stderr": 0.0008787103935280456, "rouge2_recall": 0.19923195745941746, "rouge2_recall_stderr": 0.003907735021234661, "rougeL_fmeasure": 0.1183744839063624, "rougeL_fmeasure_stderr": 0.0016906391136317978, "rougeL_precision": 0.07552316506586856, "rougeL_precision_stderr": 0.00123012987736281, "rougeL_recall": 0.3738994305484677, "rougeL_recall_stderr": 0.005048426209720286, "rougeLsum_fmeasure": 0.11887135903817798, "rougeLsum_fmeasure_stderr": 0.0017838888979302661, "rougeLsum_precision": 0.07603490019116672, "rougeLsum_precision_stderr": 0.001311880863082997, "rougeLsum_recall": 0.37313363669466976, "rougeLsum_recall_stderr": 0.004995079076932912}}, "2": {"PALM_prompt": {"bleu": 0.6600722092729007, "bleu_stderr": 0.03519197756328506, "rouge1_fmeasure": 0.1228359376509253, "rouge1_fmeasure_stderr": 0.0017091878261403075, "rouge1_precision": 0.07745497341270637, "rouge1_precision_stderr": 0.0012346921599162308, "rouge1_recall": 0.41190076505770906, "rouge1_recall_stderr": 0.005377825981214025, "rouge2_fmeasure": 0.05732608605259917, "rouge2_fmeasure_stderr": 0.0011019893164004922, "rouge2_precision": 
0.035903159669296886, "rouge2_precision_stderr": 0.0007708405381290396, "rouge2_recall": 0.20783156644312073, "rouge2_recall_stderr": 0.0039023098470249125, "rougeL_fmeasure": 0.11414361148332351, "rougeL_fmeasure_stderr": 0.0015117695102111128, "rougeL_precision": 0.07197458828228794, "rougeL_precision_stderr": 0.001092373974273461, "rougeL_recall": 0.3823575589838747, "rougeL_recall_stderr": 0.004836437439878737, "rougeLsum_fmeasure": 0.11649612591713067, "rougeLsum_fmeasure_stderr": 0.0016039234092463776, "rougeLsum_precision": 0.07349628109207673, "rougeLsum_precision_stderr": 0.0011632716540392056, "rougeLsum_recall": 0.3899929430530594, "rougeLsum_recall_stderr": 0.004966781612279243}}, "3": {"PALM_prompt": {"bleu": 0.7938690838487434, "bleu_stderr": 0.031249006432413233, "rouge1_fmeasure": 0.12765986861457063, "rouge1_fmeasure_stderr": 0.0017842279285890702, "rouge1_precision": 0.0802242746933909, "rouge1_precision_stderr": 0.0012915264518391, "rouge1_recall": 0.432425410033066, "rouge1_recall_stderr": 0.005388624217576616, "rouge2_fmeasure": 0.060003612710909315, "rouge2_fmeasure_stderr": 0.001146664477215343, "rouge2_precision": 0.03742102127943016, "rouge2_precision_stderr": 0.0007953067755003365, "rouge2_recall": 0.22000051519312314, "rouge2_recall_stderr": 0.00404703168782795, "rougeL_fmeasure": 0.11660022195969111, "rougeL_fmeasure_stderr": 0.0015332520691398934, "rougeL_precision": 0.07327312156062743, "rougeL_precision_stderr": 0.0011113752501149396, "rougeL_recall": 0.39487814553889183, "rougeL_recall_stderr": 0.004737789640173793, "rougeLsum_fmeasure": 0.12088469182871771, "rougeLsum_fmeasure_stderr": 0.001661523804555607, "rougeLsum_precision": 0.07598267078545315, "rougeLsum_precision_stderr": 0.001204590796246913, "rougeLsum_recall": 0.4089805013247068, "rougeLsum_recall_stderr": 0.004967802237281888}}, "4": {"PALM_prompt": {"bleu": 0.8218123055098046, "bleu_stderr": 0.044246313732840256, "rouge1_fmeasure": 0.12410661735510435, "rouge1_fmeasure_stderr": 0.001658918514736671, "rouge1_precision": 0.07753724548278816, "rouge1_precision_stderr": 0.0011900746777565759, "rouge1_recall": 0.4341910857165979, "rouge1_recall_stderr": 0.005446504875155181, "rouge2_fmeasure": 0.058623697481614896, "rouge2_fmeasure_stderr": 0.0010716614020870505, "rouge2_precision": 0.03630893474019336, "rouge2_precision_stderr": 0.0007351259548705939, "rouge2_recall": 0.2243388369397611, "rouge2_recall_stderr": 0.004106316366094987, "rougeL_fmeasure": 0.11289953851936033, "rougeL_fmeasure_stderr": 0.0014402500630831217, "rougeL_precision": 0.07062282112602274, "rougeL_precision_stderr": 0.0010428980489891116, "rougeL_recall": 0.3935550435860648, "rougeL_recall_stderr": 0.004760493217032566, "rougeLsum_fmeasure": 0.1172028966984663, "rougeLsum_fmeasure_stderr": 0.001554404781749481, "rougeLsum_precision": 0.07330274436223451, "rougeLsum_precision_stderr": 0.0011220316964088218, "rougeLsum_recall": 0.4085243199495146, "rougeLsum_recall_stderr": 0.0049771427706643894}}, "5": {"PALM_prompt": {"bleu": 0.913925260231582, "bleu_stderr": 0.051763978663194966, "rouge1_fmeasure": 0.12641834793897835, "rouge1_fmeasure_stderr": 0.0016320788274985598, "rouge1_precision": 0.07874146764038398, "rouge1_precision_stderr": 0.0012134560937164121, "rouge1_recall": 0.45109520601285813, "rouge1_recall_stderr": 0.005553088446218645, "rouge2_fmeasure": 0.05915409533137278, "rouge2_fmeasure_stderr": 0.0010512753815705362, "rouge2_precision": 0.03656608773889105, "rouge2_precision_stderr": 0.000760846094550187, 
"rouge2_recall": 0.23230496648757598, "rouge2_recall_stderr": 0.004227400481066823, "rougeL_fmeasure": 0.11387146771889235, "rougeL_fmeasure_stderr": 0.0013950097533787038, "rougeL_precision": 0.07104014696908037, "rougeL_precision_stderr": 0.0010444495191682013, "rougeL_recall": 0.40599373509139014, "rougeL_recall_stderr": 0.00482831666482983, "rougeLsum_fmeasure": 0.11913823061282955, "rougeLsum_fmeasure_stderr": 0.0015199756969136377, "rougeLsum_precision": 0.07423831821118593, "rougeLsum_precision_stderr": 0.0011248214939780865, "rougeLsum_recall": 0.424873549364599, "rougeLsum_recall_stderr": 0.005112245054065837}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6235229126061184, "bleu_stderr": 0.07158856142385593, "rouge1_fmeasure": 0.1745670903777373, "rouge1_fmeasure_stderr": 0.0019234371005345781, "rouge1_precision": 0.1490543596959789, "rouge1_precision_stderr": 0.0019546229536216812, "rouge1_recall": 0.25389546448233424, "rouge1_recall_stderr": 0.002741124482183061, "rouge2_fmeasure": 0.03547451018256482, "rouge2_fmeasure_stderr": 0.0008627729378350705, "rouge2_precision": 0.03016441545474279, "rouge2_precision_stderr": 0.0007719528814370501, "rouge2_recall": 0.053419440384246994, "rouge2_recall_stderr": 0.0014288658559341756, "rougeL_fmeasure": 0.13785667531218, "rougeL_fmeasure_stderr": 0.0013936984378312576, "rougeL_precision": 0.11627822506127969, "rougeL_precision_stderr": 0.0013862347099417346, "rougeL_recall": 0.2054884045718469, "rougeL_recall_stderr": 0.002244877757105672, "rougeLsum_fmeasure": 0.16028302243607997, "rougeLsum_fmeasure_stderr": 0.0017474929334903653, "rougeLsum_precision": 0.13663769110467477, "rougeLsum_precision_stderr": 0.0017798375104242446, "rougeLsum_recall": 0.23440517613484713, "rougeLsum_recall_stderr": 0.0025460475763855804}}, "1": {"tldr_en": {"bleu": 3.148014517976192, "bleu_stderr": 0.08632853762954663, "rouge1_fmeasure": 0.22808328022499394, "rouge1_fmeasure_stderr": 0.0020320515676967767, "rouge1_precision": 0.22331912988940492, "rouge1_precision_stderr": 0.0027687942933625823, "rouge1_recall": 0.3105879084943194, "rouge1_recall_stderr": 0.0029250201802578303, "rouge2_fmeasure": 0.05894457560206053, "rouge2_fmeasure_stderr": 0.001148762569744268, "rouge2_precision": 0.06014163619235408, "rouge2_precision_stderr": 0.001549861456149217, "rouge2_recall": 0.08133586965727622, "rouge2_recall_stderr": 0.0017304100412085226, "rougeL_fmeasure": 0.16440728290895265, "rougeL_fmeasure_stderr": 0.001417667346489937, "rougeL_precision": 0.16233563631626566, "rougeL_precision_stderr": 0.0021600975326785993, "rougeL_recall": 0.22817247618168887, "rougeL_recall_stderr": 0.0023092419967920526, "rougeLsum_fmeasure": 0.2138081154409409, "rougeLsum_fmeasure_stderr": 0.0019073843040340796, "rougeLsum_precision": 0.2095322539144404, "rougeLsum_precision_stderr": 0.002623522761491815, "rougeLsum_recall": 0.29186247430489015, "rougeLsum_recall_stderr": 0.0027828596792989963}}, "2": {"tldr_en": {"bleu": 3.648405351452855, "bleu_stderr": 0.12713119304572523, "rouge1_fmeasure": 0.23269927040845031, "rouge1_fmeasure_stderr": 0.0020394436744448855, "rouge1_precision": 0.25284242615549224, "rouge1_precision_stderr": 0.003346416305554945, "rouge1_recall": 0.2985206799819407, "rouge1_recall_stderr": 0.002872417745008737, "rouge2_fmeasure": 0.06308783765355935, "rouge2_fmeasure_stderr": 0.0012452038663643937, "rouge2_precision": 0.07352861081692932, "rouge2_precision_stderr": 0.001975577012025009, "rouge2_recall": 0.08045188272326337, "rouge2_recall_stderr": 
0.001687564851802267, "rougeL_fmeasure": 0.1711468063140619, "rougeL_fmeasure_stderr": 0.0015216107571068406, "rougeL_precision": 0.18896530779163317, "rougeL_precision_stderr": 0.0027402813839447243, "rougeL_recall": 0.22165381331302203, "rougeL_recall_stderr": 0.0022793478208025125, "rougeLsum_fmeasure": 0.2196667775453629, "rougeLsum_fmeasure_stderr": 0.0019303318209296886, "rougeLsum_precision": 0.23906918273035027, "rougeLsum_precision_stderr": 0.003206715750850772, "rougeLsum_recall": 0.2821026125213223, "rougeLsum_recall_stderr": 0.0027371552638994733}}, "3": {"tldr_en": {"bleu": 3.6829199554261502, "bleu_stderr": 0.08848427999980182, "rouge1_fmeasure": 0.18860588124930222, "rouge1_fmeasure_stderr": 0.002411594336470169, "rouge1_precision": 0.21412240705416308, "rouge1_precision_stderr": 0.003573906850229763, "rouge1_recall": 0.2363591035613763, "rouge1_recall_stderr": 0.0032427476384113535, "rouge2_fmeasure": 0.051285956226699604, "rouge2_fmeasure_stderr": 0.0012327125521204533, "rouge2_precision": 0.0620187002238152, "rouge2_precision_stderr": 0.001908345830156357, "rouge2_recall": 0.0644931653305227, "rouge2_recall_stderr": 0.001637056221909264, "rougeL_fmeasure": 0.14052920532911936, "rougeL_fmeasure_stderr": 0.0018198320452371584, "rougeL_precision": 0.1623855639995828, "rougeL_precision_stderr": 0.002892414171534976, "rougeL_recall": 0.17796581661301386, "rougeL_recall_stderr": 0.0025623361467453773, "rougeLsum_fmeasure": 0.17844826504839648, "rougeLsum_fmeasure_stderr": 0.0022785596409348953, "rougeLsum_precision": 0.20310628745671855, "rougeLsum_precision_stderr": 0.0034230022998127862, "rougeLsum_recall": 0.22379283876686262, "rougeLsum_recall_stderr": 0.0030762140186834028}}, "4": {"tldr_en": {"bleu": 0.4977010788321425, "bleu_stderr": 0.029197504340406147, "rouge1_fmeasure": 0.058680633472697495, "rouge1_fmeasure_stderr": 0.0020527112852523396, "rouge1_precision": 0.06596988964164227, "rouge1_precision_stderr": 0.0026452643433214004, "rouge1_recall": 0.07786694962495821, "rouge1_recall_stderr": 0.002793778795842796, "rouge2_fmeasure": 0.015912520231271727, "rouge2_fmeasure_stderr": 0.0008257625671288616, "rouge2_precision": 0.018662630886825272, "rouge2_precision_stderr": 0.0011852493101770921, "rouge2_recall": 0.021659652341736273, "rouge2_recall_stderr": 0.0011776028400838929, "rougeL_fmeasure": 0.04448068031172814, "rougeL_fmeasure_stderr": 0.0015668077231183733, "rougeL_precision": 0.05090308576277237, "rougeL_precision_stderr": 0.002121767282678217, "rougeL_recall": 0.05971715204113024, "rougeL_recall_stderr": 0.002201135080103992, "rougeLsum_fmeasure": 0.055493537558428035, "rougeLsum_fmeasure_stderr": 0.001944192579335498, "rougeLsum_precision": 0.06266723632734954, "rougeLsum_precision_stderr": 0.002537275263785622, "rougeLsum_recall": 0.07362961134314913, "rougeLsum_recall_stderr": 0.0026439669964067034}}, "5": {"tldr_en": {"bleu": 8.62147070684518e-08, "bleu_stderr": 2.4998412186774315e-07, "rouge1_fmeasure": 0.009019405424799239, "rouge1_fmeasure_stderr": 0.0008796953070286968, "rouge1_precision": 0.009347302370204104, "rouge1_precision_stderr": 0.0009939432842913038, "rouge1_recall": 0.012675375103590819, "rouge1_recall_stderr": 0.0012741296212300682, "rouge2_fmeasure": 0.002459973367560091, "rouge2_fmeasure_stderr": 0.0003520874137724776, "rouge2_precision": 0.002520125545331128, "rouge2_precision_stderr": 0.00043890835709493897, "rouge2_recall": 0.0034745962063106794, "rouge2_recall_stderr": 0.0004926837590871648, "rougeL_fmeasure": 0.006955043299953472, 
"rougeL_fmeasure_stderr": 0.0006891726751168752, "rougeL_precision": 0.007379431863883617, "rougeL_precision_stderr": 0.0008249510021821957, "rougeL_recall": 0.009772980431375072, "rougeL_recall_stderr": 0.0009978936334822926, "rougeLsum_fmeasure": 0.008527269899079934, "rougeLsum_fmeasure_stderr": 0.0008337482953004597, "rougeLsum_precision": 0.008880786469979814, "rougeLsum_precision_stderr": 0.0009541098770127502, "rougeLsum_recall": 0.011981285756158963, "rougeLsum_recall_stderr": 0.0012073677632838248}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 2.5190605487649087, "bleu_stderr": 0.14578061564278902, "rouge1_fmeasure": 0.08785204369989494, "rouge1_fmeasure_stderr": 0.001909841985405621, "rouge1_precision": 0.12015610790826121, "rouge1_precision_stderr": 0.0026013907575401264, "rouge1_recall": 0.1007142822326354, "rouge1_recall_stderr": 0.0025699976738232087, "rouge2_fmeasure": 0.020631873410295806, "rouge2_fmeasure_stderr": 0.0008986868448192905, "rouge2_precision": 0.021978597704618782, "rouge2_precision_stderr": 0.0011061787510217053, "rouge2_recall": 0.026856846878022297, "rouge2_recall_stderr": 0.0012249926502431994, "rougeL_fmeasure": 0.083870708743602, "rougeL_fmeasure_stderr": 0.0017764834490443292, "rougeL_precision": 0.11432207717597319, "rougeL_precision_stderr": 0.002375152014750786, "rougeL_recall": 0.09673002538661275, "rougeL_recall_stderr": 0.0024476967854968764, "rougeLsum_fmeasure": 0.0771386142988915, "rougeLsum_fmeasure_stderr": 0.0016644247241633418, "rougeLsum_precision": 0.10920558363229785, "rougeLsum_precision_stderr": 0.0023563322630614446, "rougeLsum_recall": 0.08646567425286227, "rougeLsum_recall_stderr": 0.0022239273238183668}}, "1": {"generate_text_restaurant": {"bleu": 11.471305506037798, "bleu_stderr": 0.07875003513092577, "rouge1_fmeasure": 0.43708472576518986, "rouge1_fmeasure_stderr": 0.00224171007536442, "rouge1_precision": 0.5170379371586666, "rouge1_precision_stderr": 0.0033287522955400954, "rouge1_recall": 0.4239211672299534, "rouge1_recall_stderr": 0.002947218737469124, "rouge2_fmeasure": 0.19998119743920914, "rouge2_fmeasure_stderr": 0.0019048339787913725, "rouge2_precision": 0.24177763061210847, "rouge2_precision_stderr": 0.0026337382842525014, "rouge2_recall": 0.1931046855790191, "rouge2_recall_stderr": 0.0020767023449716755, "rougeL_fmeasure": 0.31725506977900303, "rougeL_fmeasure_stderr": 0.0019449957706219145, "rougeL_precision": 0.3792838934227107, "rougeL_precision_stderr": 0.002982844793468976, "rougeL_recall": 0.3062355213393646, "rougeL_recall_stderr": 0.0023427585826778815, "rougeLsum_fmeasure": 0.3599294935860651, "rougeLsum_fmeasure_stderr": 0.0021823306634720274, "rougeLsum_precision": 0.42712804511272506, "rougeLsum_precision_stderr": 0.003164747958067866, "rougeLsum_recall": 0.3486090540172501, "rougeLsum_recall_stderr": 0.002679539945997737}}, "2": {"generate_text_restaurant": {"bleu": 13.980968022690044, "bleu_stderr": 0.21821573679070036, "rouge1_fmeasure": 0.47004796009089583, "rouge1_fmeasure_stderr": 0.0022258968023120985, "rouge1_precision": 0.5445902709342957, "rouge1_precision_stderr": 0.00337974740434529, "rouge1_recall": 0.458944313155064, "rouge1_recall_stderr": 0.0029017981497803163, "rouge2_fmeasure": 0.23012493273504447, "rouge2_fmeasure_stderr": 0.002033859795223151, "rouge2_precision": 0.2713453392654981, "rouge2_precision_stderr": 0.002780199873202826, "rouge2_recall": 0.2245350505416365, "rouge2_recall_stderr": 0.0022393722057462194, "rougeL_fmeasure": 0.34524196268248847, 
"rougeL_fmeasure_stderr": 0.0020571216808165926, "rougeL_precision": 0.4018381948627221, "rougeL_precision_stderr": 0.0030321901639654486, "rougeL_recall": 0.33679603771757866, "rougeL_recall_stderr": 0.002478135715528108, "rougeLsum_fmeasure": 0.39203827558400783, "rougeLsum_fmeasure_stderr": 0.0022732280818521912, "rougeLsum_precision": 0.45423494017342136, "rougeLsum_precision_stderr": 0.003233814196249404, "rougeLsum_recall": 0.3829304393507027, "rougeLsum_recall_stderr": 0.0027623882797974796}}, "3": {"generate_text_restaurant": {"bleu": 14.154980307761505, "bleu_stderr": 0.1448658074579454, "rouge1_fmeasure": 0.4705643512917158, "rouge1_fmeasure_stderr": 0.00214940659696796, "rouge1_precision": 0.5290233700279933, "rouge1_precision_stderr": 0.003286733532241089, "rouge1_recall": 0.4695155836113227, "rouge1_recall_stderr": 0.0028589858065198362, "rouge2_fmeasure": 0.23283631982568367, "rouge2_fmeasure_stderr": 0.0019771157251113228, "rouge2_precision": 0.265538771393177, "rouge2_precision_stderr": 0.0026553555632510255, "rouge2_recall": 0.23270443394551238, "rouge2_recall_stderr": 0.0022448787273333173, "rougeL_fmeasure": 0.3471376353301056, "rougeL_fmeasure_stderr": 0.0020102129845005895, "rougeL_precision": 0.39229315419704086, "rougeL_precision_stderr": 0.0029631232557498915, "rougeL_recall": 0.3458978765243631, "rougeL_recall_stderr": 0.0024472277771961978, "rougeLsum_fmeasure": 0.3954525442177938, "rougeLsum_fmeasure_stderr": 0.0022179179708118656, "rougeLsum_precision": 0.44487893090959907, "rougeLsum_precision_stderr": 0.003164337474810978, "rougeLsum_recall": 0.39451585686809876, "rougeLsum_recall_stderr": 0.002730627600611841}}, "4": {"generate_text_restaurant": {"bleu": 13.721038666752582, "bleu_stderr": 0.17683442815246456, "rouge1_fmeasure": 0.4691628743032071, "rouge1_fmeasure_stderr": 0.002144459294602491, "rouge1_precision": 0.5089881039212353, "rouge1_precision_stderr": 0.0031941598167466606, "rouge1_recall": 0.479032657967036, "rouge1_recall_stderr": 0.002765675705775823, "rouge2_fmeasure": 0.23219088772710936, "rouge2_fmeasure_stderr": 0.001982880603357365, "rouge2_precision": 0.2551097554402564, "rouge2_precision_stderr": 0.002576496565725004, "rouge2_recall": 0.23713242535420967, "rouge2_recall_stderr": 0.00222316717149407, "rougeL_fmeasure": 0.3437342989792434, "rougeL_fmeasure_stderr": 0.0019737589675455874, "rougeL_precision": 0.37390882033029804, "rougeL_precision_stderr": 0.0028070940255931595, "rougeL_recall": 0.3512215949214737, "rougeL_recall_stderr": 0.0024022657027433863, "rougeLsum_fmeasure": 0.39438992601573425, "rougeLsum_fmeasure_stderr": 0.0022083897072610196, "rougeLsum_precision": 0.4277340821604659, "rougeLsum_precision_stderr": 0.003042188683426346, "rougeLsum_recall": 0.40285522466618634, "rougeLsum_recall_stderr": 0.0026869688850980986}}, "5": {"generate_text_restaurant": {"bleu": 13.685692148533645, "bleu_stderr": 0.18475778017537345, "rouge1_fmeasure": 0.4719066444266488, "rouge1_fmeasure_stderr": 0.0021232213144874593, "rouge1_precision": 0.508438587476658, "rouge1_precision_stderr": 0.0030967236462497586, "rouge1_recall": 0.4814718095446795, "rouge1_recall_stderr": 0.002740769966051787, "rouge2_fmeasure": 0.2335596273503415, "rouge2_fmeasure_stderr": 0.0019811310308482434, "rouge2_precision": 0.2543070392775442, "rouge2_precision_stderr": 0.0025002270899075695, "rouge2_recall": 0.2386513703857796, "rouge2_recall_stderr": 0.0022344362770068673, "rougeL_fmeasure": 0.34320081095568616, "rougeL_fmeasure_stderr": 0.001977144393265691, 
"rougeL_precision": 0.3699950720656377, "rougeL_precision_stderr": 0.0026651187832523664, "rougeL_recall": 0.35074320852708746, "rougeL_recall_stderr": 0.002429104142195895, "rougeLsum_fmeasure": 0.3963766746924401, "rougeLsum_fmeasure_stderr": 0.00219952367965179, "rougeLsum_precision": 0.427080652482253, "rougeLsum_precision_stderr": 0.002971111279966823, "rougeLsum_recall": 0.40443782365098674, "rougeLsum_recall_stderr": 0.0026631012930175323}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.160170846637464, "bleu_stderr": 0.09889873775096984, "rouge1_fmeasure": 0.2155668468544653, "rouge1_fmeasure_stderr": 0.0026646559073492822, "rouge1_precision": 0.1675664806545814, "rouge1_precision_stderr": 0.002452603521749886, "rouge1_recall": 0.34303659164580813, "rouge1_recall_stderr": 0.00451220621305511, "rouge2_fmeasure": 0.05045080374949566, "rouge2_fmeasure_stderr": 0.0017576960885745984, "rouge2_precision": 0.03864946281782141, "rouge2_precision_stderr": 0.0014769194957715624, "rouge2_recall": 0.08335808413265051, "rouge2_recall_stderr": 0.002908314864654091, "rougeL_fmeasure": 0.1635478469839484, "rougeL_fmeasure_stderr": 0.0021077578920492917, "rougeL_precision": 0.12702750275933555, "rougeL_precision_stderr": 0.001968823066408308, "rougeL_recall": 0.26154017483908965, "rougeL_recall_stderr": 0.003585895961808971, "rougeLsum_fmeasure": 0.16836751227813998, "rougeLsum_fmeasure_stderr": 0.0022998758409405514, "rougeLsum_precision": 0.13051144332858214, "rougeLsum_precision_stderr": 0.0020713501923767027, "rougeLsum_recall": 0.26981706823174656, "rougeLsum_recall_stderr": 0.003957679243103724}}, "1": {"article_DOC_summary": {"bleu": 1.8701953699144762, "bleu_stderr": 0.05457096063405516, "rouge1_fmeasure": 0.1898050764365515, "rouge1_fmeasure_stderr": 0.0025895313777244354, "rouge1_precision": 0.13520973640137626, "rouge1_precision_stderr": 0.001930299351096568, "rouge1_recall": 0.3323577264719374, "rouge1_recall_stderr": 0.004491568598510519, "rouge2_fmeasure": 0.04465858992847766, "rouge2_fmeasure_stderr": 0.0015759770915751918, "rouge2_precision": 0.03140636068492149, "rouge2_precision_stderr": 0.001113870373266841, "rouge2_recall": 0.08109440867971819, "rouge2_recall_stderr": 0.0029471809124174825, "rougeL_fmeasure": 0.14968873061397606, "rougeL_fmeasure_stderr": 0.001933469116399493, "rougeL_precision": 0.10636283963297354, "rougeL_precision_stderr": 0.0014233454751993193, "rougeL_recall": 0.26423746064810627, "rougeL_recall_stderr": 0.003579439421960654, "rougeLsum_fmeasure": 0.14993016736293346, "rougeLsum_fmeasure_stderr": 0.0021358853384488446, "rougeLsum_precision": 0.10653719114272985, "rougeLsum_precision_stderr": 0.0015634601753452939, "rougeLsum_recall": 0.26451619174803076, "rougeLsum_recall_stderr": 0.003902068769045215}}, "2": {"article_DOC_summary": {"bleu": 1.8292996100546781, "bleu_stderr": 0.08163094513619551, "rouge1_fmeasure": 0.1913140581117672, "rouge1_fmeasure_stderr": 0.0025051242733995895, "rouge1_precision": 0.1369596940677527, "rouge1_precision_stderr": 0.0018924121735394107, "rouge1_recall": 0.33267847518896543, "rouge1_recall_stderr": 0.004382896639279902, "rouge2_fmeasure": 0.046514407795467595, "rouge2_fmeasure_stderr": 0.0015752063358252097, "rouge2_precision": 0.03283865595587313, "rouge2_precision_stderr": 0.0011141105617716379, "rouge2_recall": 0.08404626501275646, "rouge2_recall_stderr": 0.0029766014928721417, "rougeL_fmeasure": 0.1555172607724421, "rougeL_fmeasure_stderr": 0.0019665304832963766, "rougeL_precision": 0.11105944816976555, 
"rougeL_precision_stderr": 0.0014582290975397828, "rougeL_recall": 0.27226184993618935, "rougeL_recall_stderr": 0.003638521351468343, "rougeLsum_fmeasure": 0.148021529852108, "rougeLsum_fmeasure_stderr": 0.0021093930620772303, "rougeLsum_precision": 0.10559932998246374, "rougeLsum_precision_stderr": 0.0015493840715498106, "rougeLsum_recall": 0.2597115453851503, "rougeLsum_recall_stderr": 0.003887704936356228}}, "3": {"article_DOC_summary": {"bleu": 1.729714249041032, "bleu_stderr": 0.09968587223572584, "rouge1_fmeasure": 0.1828325966117306, "rouge1_fmeasure_stderr": 0.0026884744365811403, "rouge1_precision": 0.13421303891622097, "rouge1_precision_stderr": 0.0021412802231387967, "rouge1_recall": 0.310980261970668, "rouge1_recall_stderr": 0.004684135200277263, "rouge2_fmeasure": 0.043981143624385655, "rouge2_fmeasure_stderr": 0.0015389553455476062, "rouge2_precision": 0.03176375904264973, "rouge2_precision_stderr": 0.0011285744197009628, "rouge2_recall": 0.0771291588842163, "rouge2_recall_stderr": 0.0028009360577460445, "rougeL_fmeasure": 0.15042887440453737, "rougeL_fmeasure_stderr": 0.002182144987791837, "rougeL_precision": 0.11010971775440595, "rougeL_precision_stderr": 0.0017060839433245796, "rougeL_recall": 0.257265478142769, "rougeL_recall_stderr": 0.003940518987777604, "rougeLsum_fmeasure": 0.14143730410765654, "rougeLsum_fmeasure_stderr": 0.0022370123860963476, "rougeLsum_precision": 0.10364375570761218, "rougeLsum_precision_stderr": 0.0017533516458810485, "rougeLsum_recall": 0.24201646695848927, "rougeLsum_recall_stderr": 0.004005217279812412}}, "4": {"article_DOC_summary": {"bleu": 0.9890184534659001, "bleu_stderr": 0.14163039170125274, "rouge1_fmeasure": 0.0521003876015691, "rouge1_fmeasure_stderr": 0.002910995728301919, "rouge1_precision": 0.04440086644350133, "rouge1_precision_stderr": 0.0028076146832161425, "rouge1_recall": 0.0803342918035357, "rouge1_recall_stderr": 0.004572536216376621, "rouge2_fmeasure": 0.01307647569779615, "rouge2_fmeasure_stderr": 0.0011218803952754701, "rouge2_precision": 0.011721386490746217, "rouge2_precision_stderr": 0.001394908515617324, "rouge2_recall": 0.020621720002167342, "rouge2_recall_stderr": 0.0017793611110912008, "rougeL_fmeasure": 0.04296796327738306, "rougeL_fmeasure_stderr": 0.002392166194141485, "rougeL_precision": 0.03669834695042482, "rougeL_precision_stderr": 0.0023530729136330853, "rougeL_recall": 0.06682107529885967, "rougeL_recall_stderr": 0.0038307757001802474, "rougeLsum_fmeasure": 0.042028608773829565, "rougeLsum_fmeasure_stderr": 0.0023859608992589927, "rougeLsum_precision": 0.03617640134116618, "rougeLsum_precision_stderr": 0.0023748208613590294, "rougeLsum_recall": 0.06502829732266231, "rougeLsum_recall_stderr": 0.003786163884263381}}, "5": {"article_DOC_summary": {"bleu": 1.3300633595706627e-37, "bleu_stderr": 2.704529748702706e-32, "rouge1_fmeasure": 0.002975946933660433, "rouge1_fmeasure_stderr": 0.0008325096950346747, "rouge1_precision": 0.003536517824005411, "rouge1_precision_stderr": 0.0010348361693354221, "rouge1_recall": 0.0026502867716370814, "rouge1_recall_stderr": 0.0007219200372674768, "rouge2_fmeasure": 0.0005678964868499288, "rouge2_fmeasure_stderr": 0.00022071683514847003, "rouge2_precision": 0.0007216185282223018, "rouge2_precision_stderr": 0.00028726019405366274, "rouge2_recall": 0.00047841047841047846, "rouge2_recall_stderr": 0.00018648358251820024, "rougeL_fmeasure": 0.002335136000302258, "rougeL_fmeasure_stderr": 0.000650477644840957, "rougeL_precision": 0.0027483075773717747, "rougeL_precision_stderr": 
0.0008030756319798513, "rougeL_recall": 0.0021047616766507017, "rougeL_recall_stderr": 0.0005731278953348982, "rougeLsum_fmeasure": 0.0024473347701190456, "rougeLsum_fmeasure_stderr": 0.0006984333555662036, "rougeLsum_precision": 0.0029063810983753734, "rougeLsum_precision_stderr": 0.0008739925650642316, "rougeLsum_recall": 0.0021888663476128875, "rougeLsum_recall_stderr": 0.0006063079941836999}}}} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_0.csv b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..2f19c3161e8dbbdc938cf0a3e02199351d7ff33d --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932575,0 +anli_r2,acc,0.333,0.014910846164229857,0 +anli_r3,acc,0.3416666666666667,0.013696658778002512,0 +arc_challenge,acc,0.26791808873720135,0.012942030195136432,0 +arc_challenge,acc_norm,0.2909556313993174,0.013273077865907581,0 +arc_easy,acc,0.61489898989899,0.009985214798737247,0 +arc_easy,acc_norm,0.5349326599326599,0.010234713052723684,0 +boolq,acc,0.5804281345565749,0.008631175489166722,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.1940928270042194,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.48207528380800635,0.004986573992451681,0 +hellaswag,acc_norm,0.6312487552280422,0.004814803098436799,0 +piqa,acc,0.7616974972796517,0.009940334245876203,0 +piqa,acc_norm,0.7665941240478781,0.009869247889520993,0 +rte,acc,0.5451263537906137,0.029973636495415252,0 +sciq,acc,0.833,0.011800434324644586,0 +sciq,acc_norm,0.747,0.01375427861358708,0 +storycloze_2016,acc,0.7252805986103688,0.010322309878339504,0 +winogrande,acc,0.5832675611681136,0.013856250072796318,0 diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_0_lm-eval_global_step80108_2023-02-22-18-53-17_0shots_backup.json b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_0_lm-eval_global_step80108_2023-02-22-18-53-17_0shots_backup.json deleted file mode 100644 index 950b0d0a3c5c0a7e31a1aeb1e46d770fc3673ba5..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_0_lm-eval_global_step80108_2023-02-22-18-53-17_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932575 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229857 - }, - "anli_r3": { - "acc": 0.3416666666666667, - "acc_stderr": 0.013696658778002512 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.1940928270042194 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 0.48207528380800635, - "acc_stderr": 0.004986573992451681, - "acc_norm": 0.6312487552280422, - "acc_norm_stderr": 0.004814803098436799 - }, - "rte": { - "acc": 0.5451263537906137, - "acc_stderr": 0.029973636495415252 - }, - "winogrande": { - "acc": 0.5832675611681136, - "acc_stderr": 0.013856250072796318 - }, - "storycloze_2016": { - "acc": 0.7252805986103688, - "acc_stderr": 0.010322309878339504 - }, - "boolq": { - "acc": 0.5804281345565749, - "acc_stderr": 0.008631175489166722 - }, - "arc_easy": { - "acc": 0.61489898989899, - "acc_stderr": 0.009985214798737247, - "acc_norm": 0.5349326599326599, - "acc_norm_stderr": 0.010234713052723684 - }, - "arc_challenge": { - "acc": 0.26791808873720135, - "acc_stderr": 
0.012942030195136432, - "acc_norm": 0.2909556313993174, - "acc_norm_stderr": 0.013273077865907581 - }, - "sciq": { - "acc": 0.833, - "acc_stderr": 0.011800434324644586, - "acc_norm": 0.747, - "acc_norm_stderr": 0.01375427861358708 - }, - "piqa": { - "acc": 0.7616974972796517, - "acc_stderr": 0.009940334245876203, - "acc_norm": 0.7665941240478781, - "acc_norm_stderr": 0.009869247889520993 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_1.csv b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..87fa591b48bb5d329ac49475d02a45102b5322e4 --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732963,0 +anli_r2,acc,0.333,0.014910846164229868,0 +anli_r3,acc,0.3491666666666667,0.013767075395077247,0 +arc_challenge,acc,0.29266211604095566,0.013295916103619411,0 +arc_challenge,acc_norm,0.3225255972696246,0.01365998089427737,0 +arc_easy,acc,0.6212121212121212,0.009953737656542035,0 +arc_easy,acc_norm,0.5833333333333334,0.010116282977781254,0 +boolq,acc,0.599388379204893,0.008570545612096374,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.23179160021265285,,1 +copa,acc,0.79,0.040936018074033256,0 +hellaswag,acc,0.48088030272854015,0.004986131919673967,0 +hellaswag,acc_norm,0.630053774148576,0.004818031396138917,0 +piqa,acc,0.7529923830250272,0.01006226814077262,0 +piqa,acc_norm,0.7627856365614799,0.009924694933586374,0 +rte,acc,0.5667870036101083,0.029826764082138277,0 +sciq,acc,0.887,0.010016552866696848,0 +sciq,acc_norm,0.876,0.01042749887234396,0 +storycloze_2016,acc,0.7204703367183325,0.01037770209970486,0 +winogrande,acc,0.5951065509076559,0.013795927003124939,0 diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_1_lm-eval_global_step80108_2023-02-22-18-53-17_1shots_backup.json b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_1_lm-eval_global_step80108_2023-02-22-18-53-17_1shots_backup.json deleted file mode 100644 index 74d9db589a2a1dc742efbd3589ba51e1e8b6d38f..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_1_lm-eval_global_step80108_2023-02-22-18-53-17_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732963 - }, - "anli_r2": { - "acc": 0.333, - "acc_stderr": 0.014910846164229868 - }, - "anli_r3": { - "acc": 0.3491666666666667, - "acc_stderr": 0.013767075395077247 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.23179160021265285 - }, - "copa": { - "acc": 0.79, - "acc_stderr": 0.040936018074033256 - }, - "hellaswag": { - "acc": 0.48088030272854015, - "acc_stderr": 0.004986131919673967, - "acc_norm": 0.630053774148576, - "acc_norm_stderr": 0.004818031396138917 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.029826764082138277 - }, - "winogrande": { - "acc": 0.5951065509076559, - "acc_stderr": 0.013795927003124939 - }, - "storycloze_2016": { - "acc": 0.7204703367183325, - "acc_stderr": 0.01037770209970486 - }, - "boolq": { - "acc": 0.599388379204893, - "acc_stderr": 
0.008570545612096374 - }, - "arc_easy": { - "acc": 0.6212121212121212, - "acc_stderr": 0.009953737656542035, - "acc_norm": 0.5833333333333334, - "acc_norm_stderr": 0.010116282977781254 - }, - "arc_challenge": { - "acc": 0.29266211604095566, - "acc_stderr": 0.013295916103619411, - "acc_norm": 0.3225255972696246, - "acc_norm_stderr": 0.01365998089427737 - }, - "sciq": { - "acc": 0.887, - "acc_stderr": 0.010016552866696848, - "acc_norm": 0.876, - "acc_norm_stderr": 0.01042749887234396 - }, - "piqa": { - "acc": 0.7529923830250272, - "acc_stderr": 0.01006226814077262, - "acc_norm": 0.7627856365614799, - "acc_norm_stderr": 0.009924694933586374 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_2.csv b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..476192f4b4246e5ff8f1a44e2735cc9f6e901155 --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.335,0.014933117490932573,0 +anli_r2,acc,0.334,0.014922019523732954,0 +anli_r3,acc,0.3425,0.013704669762934727,0 +arc_challenge,acc,0.2935153583617747,0.01330725044494113,0 +arc_challenge,acc_norm,0.32764505119453924,0.013715847940719346,0 +arc_easy,acc,0.6224747474747475,0.009947227833469432,0 +arc_easy,acc_norm,0.6047979797979798,0.010031894052790978,0 +boolq,acc,0.6048929663608563,0.0085504542482809,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.258008658008658,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.4801832304321848,0.0049858608534276315,0 +hellaswag,acc_norm,0.6342362079267079,0.004806593424942264,0 +piqa,acc,0.7546245919477693,0.010039831320422401,0 +piqa,acc_norm,0.7622415669205659,0.009932525779525492,0 +rte,acc,0.5703971119133574,0.02979666882912467,0 +sciq,acc,0.913,0.008916866630745908,0 +sciq,acc_norm,0.897,0.009616833339695798,0 +storycloze_2016,acc,0.72367717797969,0.010340939873166822,0 +winogrande,acc,0.6029992107340174,0.013751092519806702,0 diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_2_lm-eval_global_step80108_2023-02-22-18-53-17_2shots_backup.json b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_2_lm-eval_global_step80108_2023-02-22-18-53-17_2shots_backup.json deleted file mode 100644 index d360b8ff3ed404cd5ab315dffa40652568ddfe03..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_2_lm-eval_global_step80108_2023-02-22-18-53-17_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.335, - "acc_stderr": 0.014933117490932573 - }, - "anli_r2": { - "acc": 0.334, - "acc_stderr": 0.014922019523732954 - }, - "anli_r3": { - "acc": 0.3425, - "acc_stderr": 0.013704669762934727 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.258008658008658 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 0.4801832304321848, - "acc_stderr": 0.0049858608534276315, - "acc_norm": 0.6342362079267079, - "acc_norm_stderr": 0.004806593424942264 - }, - "rte": { - "acc": 0.5703971119133574, - "acc_stderr": 0.02979666882912467 - }, - "winogrande": { - 
"acc": 0.6029992107340174, - "acc_stderr": 0.013751092519806702 - }, - "storycloze_2016": { - "acc": 0.72367717797969, - "acc_stderr": 0.010340939873166822 - }, - "boolq": { - "acc": 0.6048929663608563, - "acc_stderr": 0.0085504542482809 - }, - "arc_easy": { - "acc": 0.6224747474747475, - "acc_stderr": 0.009947227833469432, - "acc_norm": 0.6047979797979798, - "acc_norm_stderr": 0.010031894052790978 - }, - "arc_challenge": { - "acc": 0.2935153583617747, - "acc_stderr": 0.01330725044494113, - "acc_norm": 0.32764505119453924, - "acc_norm_stderr": 0.013715847940719346 - }, - "sciq": { - "acc": 0.913, - "acc_stderr": 0.008916866630745908, - "acc_norm": 0.897, - "acc_norm_stderr": 0.009616833339695798 - }, - "piqa": { - "acc": 0.7546245919477693, - "acc_stderr": 0.010039831320422401, - "acc_norm": 0.7622415669205659, - "acc_norm_stderr": 0.009932525779525492 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_3.csv b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..00d010a774ab4d6f64444a2c1f86c46622a969c7 --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.319,0.014746404865473472,0 +anli_r2,acc,0.341,0.014998131348402704,0 +anli_r3,acc,0.3566666666666667,0.013833742805050722,0 +arc_challenge,acc,0.2986348122866894,0.013374078615068756,0 +arc_challenge,acc_norm,0.3225255972696246,0.013659980894277376,0 +arc_easy,acc,0.6292087542087542,0.009911292822056923,0 +arc_easy,acc_norm,0.6136363636363636,0.009991296778159617,0 +boolq,acc,0.6162079510703364,0.008505584729104983,1 +cb,acc,0.39285714285714285,0.0658538889806635,1 +cb,f1,0.224400871459695,,1 +copa,acc,0.83,0.03775251680686371,0 +hellaswag,acc,0.4819757020513842,0.004986538243846636,0 +hellaswag,acc_norm,0.6331408086038638,0.004809626723626843,0 +piqa,acc,0.7557127312295974,0.010024765172284242,0 +piqa,acc_norm,0.7682263329706203,0.009845143772794046,0 +rte,acc,0.5631768953068592,0.029855247390314945,0 +sciq,acc,0.911,0.009008893392651526,0 +sciq,acc_norm,0.894,0.00973955126578513,0 +storycloze_2016,acc,0.7279529663281668,0.010290888060871242,0 +winogrande,acc,0.6045777426992897,0.013741678387545347,0 diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_3_lm-eval_global_step80108_2023-02-22-18-53-17_3shots_backup.json b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_3_lm-eval_global_step80108_2023-02-22-18-53-17_3shots_backup.json deleted file mode 100644 index 979c2816027792946efcf70312b1d8dc65be9fe4..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_3_lm-eval_global_step80108_2023-02-22-18-53-17_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.319, - "acc_stderr": 0.014746404865473472 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.014998131348402704 - }, - "anli_r3": { - "acc": 0.3566666666666667, - "acc_stderr": 0.013833742805050722 - }, - "cb": { - "acc": 0.39285714285714285, - "acc_stderr": 0.0658538889806635, - "f1": 0.224400871459695 - }, - "copa": { - "acc": 0.83, - "acc_stderr": 0.03775251680686371 - }, - "hellaswag": { - "acc": 
0.4819757020513842, - "acc_stderr": 0.004986538243846636, - "acc_norm": 0.6331408086038638, - "acc_norm_stderr": 0.004809626723626843 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.029855247390314945 - }, - "winogrande": { - "acc": 0.6045777426992897, - "acc_stderr": 0.013741678387545347 - }, - "storycloze_2016": { - "acc": 0.7279529663281668, - "acc_stderr": 0.010290888060871242 - }, - "boolq": { - "acc": 0.6162079510703364, - "acc_stderr": 0.008505584729104983 - }, - "arc_easy": { - "acc": 0.6292087542087542, - "acc_stderr": 0.009911292822056923, - "acc_norm": 0.6136363636363636, - "acc_norm_stderr": 0.009991296778159617 - }, - "arc_challenge": { - "acc": 0.2986348122866894, - "acc_stderr": 0.013374078615068756, - "acc_norm": 0.3225255972696246, - "acc_norm_stderr": 0.013659980894277376 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651526, - "acc_norm": 0.894, - "acc_norm_stderr": 0.00973955126578513 - }, - "piqa": { - "acc": 0.7557127312295974, - "acc_stderr": 0.010024765172284242, - "acc_norm": 0.7682263329706203, - "acc_norm_stderr": 0.009845143772794046 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_4.csv b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..07b750451c0ee89628e3dd9c6d5866b1f33b79ee --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.32,0.014758652303574888,0 +anli_r2,acc,0.341,0.0149981313484027,0 +anli_r3,acc,0.3308333333333333,0.013588208070708992,0 +arc_challenge,acc,0.3054607508532423,0.013460080478002496,0 +arc_challenge,acc_norm,0.33447098976109213,0.013787460322441375,0 +arc_easy,acc,0.640993265993266,0.009843424713072176,0 +arc_easy,acc_norm,0.6233164983164983,0.009942848077476165,0 +boolq,acc,0.6226299694189602,0.008477957863309996,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.18571428571428572,,1 +copa,acc,0.84,0.0368452949177471,0 +hellaswag,acc,0.4788886675960964,0.004985331652408344,0 +hellaswag,acc_norm,0.6372236606253734,0.0047981844631563575,0 +piqa,acc,0.7546245919477693,0.010039831320422398,0 +piqa,acc_norm,0.7709466811751904,0.009804509865175505,0 +rte,acc,0.51985559566787,0.030072723167317177,0 +sciq,acc,0.915,0.00882342636694232,0 +sciq,acc_norm,0.906,0.009233052000787736,0 +storycloze_2016,acc,0.7300908605024051,0.010265413503221462,0 +winogrande,acc,0.6227308602999211,0.013622567928799501,0 diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_4_lm-eval_global_step80108_2023-02-22-18-53-17_4shots_backup.json b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_4_lm-eval_global_step80108_2023-02-22-18-53-17_4shots_backup.json deleted file mode 100644 index b2fd5c0961d0ed108c95085b66899c7342303c77..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_4_lm-eval_global_step80108_2023-02-22-18-53-17_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.32, - "acc_stderr": 0.014758652303574888 - }, - "anli_r2": { - "acc": 0.341, - "acc_stderr": 0.0149981313484027 - }, - "anli_r3": { - "acc": 0.3308333333333333, - 
"acc_stderr": 0.013588208070708992 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.18571428571428572 - }, - "copa": { - "acc": 0.84, - "acc_stderr": 0.0368452949177471 - }, - "hellaswag": { - "acc": 0.4788886675960964, - "acc_stderr": 0.004985331652408344, - "acc_norm": 0.6372236606253734, - "acc_norm_stderr": 0.0047981844631563575 - }, - "rte": { - "acc": 0.51985559566787, - "acc_stderr": 0.030072723167317177 - }, - "winogrande": { - "acc": 0.6227308602999211, - "acc_stderr": 0.013622567928799501 - }, - "storycloze_2016": { - "acc": 0.7300908605024051, - "acc_stderr": 0.010265413503221462 - }, - "boolq": { - "acc": 0.6226299694189602, - "acc_stderr": 0.008477957863309996 - }, - "arc_easy": { - "acc": 0.640993265993266, - "acc_stderr": 0.009843424713072176, - "acc_norm": 0.6233164983164983, - "acc_norm_stderr": 0.009942848077476165 - }, - "arc_challenge": { - "acc": 0.3054607508532423, - "acc_stderr": 0.013460080478002496, - "acc_norm": 0.33447098976109213, - "acc_norm_stderr": 0.013787460322441375 - }, - "sciq": { - "acc": 0.915, - "acc_stderr": 0.00882342636694232, - "acc_norm": 0.906, - "acc_norm_stderr": 0.009233052000787736 - }, - "piqa": { - "acc": 0.7546245919477693, - "acc_stderr": 0.010039831320422398, - "acc_norm": 0.7709466811751904, - "acc_norm_stderr": 0.009804509865175505 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_5.csv b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..706ced2af7d715e17856d2035ee5636d5c014f39 --- /dev/null +++ b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363935,0 +anli_r2,acc,0.347,0.015060472031706617,0 +anli_r3,acc,0.3325,0.01360541734571053,0 +arc_challenge,acc,0.30119453924914674,0.01340674176784762,0 +arc_challenge,acc_norm,0.32337883959044367,0.013669421630012123,0 +arc_easy,acc,0.6477272727272727,0.009801753933112778,0 +arc_easy,acc_norm,0.6199494949494949,0.009960175831493124,0 +boolq,acc,0.6214067278287462,0.00848334171802448,1 +cb,acc,0.32142857142857145,0.06297362289056341,1 +cb,f1,0.1621621621621622,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.48008364867556264,0.0049858213361464,0 +hellaswag,acc_norm,0.6368253335988847,0.00479931720990201,0 +piqa,acc,0.7524483133841132,0.01006970396685711,0 +piqa,acc_norm,0.7747551686615887,0.009746643471032136,0 +rte,acc,0.5595667870036101,0.029882123363118716,0 +sciq,acc,0.918,0.008680515615523722,0 +sciq,acc_norm,0.914,0.008870325962594766,0 +storycloze_2016,acc,0.7279529663281668,0.010290888060871242,0 +winogrande,acc,0.6179952644041041,0.013655578215970424,0 diff --git a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_5_lm-eval_global_step80108_2023-02-22-18-53-17_5shots_backup.json b/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_5_lm-eval_global_step80108_2023-02-22-18-53-17_5shots_backup.json deleted file mode 100644 index dcef3076cb7d921944330ebdc2457b0e479bd084..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed2/evaluation/rankeval/4b284b84bc4v2seed2_5_lm-eval_global_step80108_2023-02-22-18-53-17_5shots_backup.json +++ 
/dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363935 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706617 - }, - "anli_r3": { - "acc": 0.3325, - "acc_stderr": 0.01360541734571053 - }, - "cb": { - "acc": 0.32142857142857145, - "acc_stderr": 0.06297362289056341, - "f1": 0.1621621621621622 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.48008364867556264, - "acc_stderr": 0.0049858213361464, - "acc_norm": 0.6368253335988847, - "acc_norm_stderr": 0.00479931720990201 - }, - "rte": { - "acc": 0.5595667870036101, - "acc_stderr": 0.029882123363118716 - }, - "winogrande": { - "acc": 0.6179952644041041, - "acc_stderr": 0.013655578215970424 - }, - "storycloze_2016": { - "acc": 0.7279529663281668, - "acc_stderr": 0.010290888060871242 - }, - "boolq": { - "acc": 0.6214067278287462, - "acc_stderr": 0.00848334171802448 - }, - "arc_easy": { - "acc": 0.6477272727272727, - "acc_stderr": 0.009801753933112778, - "acc_norm": 0.6199494949494949, - "acc_norm_stderr": 0.009960175831493124 - }, - "arc_challenge": { - "acc": 0.30119453924914674, - "acc_stderr": 0.01340674176784762, - "acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.013669421630012123 - }, - "sciq": { - "acc": 0.918, - "acc_stderr": 0.008680515615523722, - "acc_norm": 0.914, - "acc_norm_stderr": 0.008870325962594766 - }, - "piqa": { - "acc": 0.7524483133841132, - "acc_stderr": 0.01006970396685711, - "acc_norm": 0.7747551686615887, - "acc_norm_stderr": 0.009746643471032136 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed3/evaluation/generation/merged.csv b/4b284b84bc4v2seed3/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..d2230a417ccd6b6073328d256e36f43fb7a4102a --- /dev/null +++ b/4b284b84bc4v2seed3/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0045379311452906496 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0045379311452906496 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.20464247864479315 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.20464247864479315 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22811726769856144 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22811726769856144 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23831207673207944 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23831207673207944 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23895749463268698 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23895749463268698 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24237216469601944 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24237216469601944 +e2e_nlg_cleaned,5,average,multiple,0.19282323559157186 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04856213249829562 +gem_xsum,0,median,rouge2_fmeasure,0.04856213249829562 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03694944789541673 +gem_xsum,1,median,rouge2_fmeasure,0.03694944789541673 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.03989446165864525 +gem_xsum,2,median,rouge2_fmeasure,0.03989446165864525 
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.040350616115479146 +gem_xsum,3,median,rouge2_fmeasure,0.040350616115479146 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009949994948206649 +gem_xsum,4,median,rouge2_fmeasure,0.009949994948206649 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00021508550838189274 +gem_xsum,5,median,rouge2_fmeasure,0.00021508550838189274 +gem_xsum,5,average,multiple,0.02932028977073755 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04638572157936821 +web_nlg_en,0,median,rouge2_fmeasure,0.04638572157936821 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.052828679189574415 +web_nlg_en,1,median,rouge2_fmeasure,0.052828679189574415 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05449451670049787 +web_nlg_en,2,median,rouge2_fmeasure,0.05449451670049787 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05302505138393545 +web_nlg_en,3,median,rouge2_fmeasure,0.05302505138393545 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05494234282461018 +web_nlg_en,4,median,rouge2_fmeasure,0.05494234282461018 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05660554273746949 +web_nlg_en,5,median,rouge2_fmeasure,0.05660554273746949 +web_nlg_en,5,average,multiple,0.05304697573590927 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.035914103574269995 +wiki_lingua_en,0,median,rouge2_fmeasure,0.035914103574269995 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.056001917384198796 +wiki_lingua_en,1,median,rouge2_fmeasure,0.056001917384198796 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05753113933914173 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05753113933914173 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04592788875910395 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04592788875910395 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015593229425275027 +wiki_lingua_en,4,median,rouge2_fmeasure,0.015593229425275027 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002384610836596322 +wiki_lingua_en,5,median,rouge2_fmeasure,0.002384610836596322 +wiki_lingua_en,5,average,multiple,0.03555881488643097 diff --git a/4b284b84bc4v2seed3/evaluation/generation/merged.json b/4b284b84bc4v2seed3/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..5cee214f997eb205937f060cbae772fbc9c03af2 --- /dev/null +++ b/4b284b84bc4v2seed3/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.33698931558059864, "bleu_stderr": 0.03520091082802491, "rouge1_fmeasure": 0.09682464818590401, "rouge1_fmeasure_stderr": 0.0021746201526808115, "rouge1_precision": 0.06832528700117271, "rouge1_precision_stderr": 0.002020031616020774, "rouge1_recall": 0.24924535874354928, "rouge1_recall_stderr": 0.004710667742123965, "rouge2_fmeasure": 0.04638572157936821, "rouge2_fmeasure_stderr": 0.0013302529021085698, "rouge2_precision": 0.031623603705718525, "rouge2_precision_stderr": 0.0010704082391642267, "rouge2_recall": 0.12321043440007193, "rouge2_recall_stderr": 0.0031299036359367484, "rougeL_fmeasure": 0.09281887400756049, "rougeL_fmeasure_stderr": 0.001998996594014213, "rougeL_precision": 0.06503251084712686, "rougeL_precision_stderr": 0.0018228814683563853, "rougeL_recall": 0.242120358957363, "rougeL_recall_stderr": 0.0045711852361007615, "rougeLsum_fmeasure": 0.09242326380243195, "rougeLsum_fmeasure_stderr": 0.002033441173958409, "rougeLsum_precision": 0.06504280718043812, "rougeLsum_precision_stderr": 0.0018744892527656203, "rougeLsum_recall": 0.23925679819243256, "rougeLsum_recall_stderr": 0.004467359403537819}}, "1": {"PALM_prompt": {"bleu": 0.50071840910503, 
"bleu_stderr": 0.04570119173979742, "rouge1_fmeasure": 0.11243638843069366, "rouge1_fmeasure_stderr": 0.0020002359883771006, "rouge1_precision": 0.07262405538535584, "rouge1_precision_stderr": 0.0015025709069587576, "rouge1_recall": 0.3545200580306086, "rouge1_recall_stderr": 0.005274328693296472, "rouge2_fmeasure": 0.052828679189574415, "rouge2_fmeasure_stderr": 0.0012458061281784743, "rouge2_precision": 0.03411790558117329, "rouge2_precision_stderr": 0.0009146851141661013, "rouge2_recall": 0.1735824312348198, "rouge2_recall_stderr": 0.0036281888114770787, "rougeL_fmeasure": 0.10640900334612001, "rougeL_fmeasure_stderr": 0.0018168238044968257, "rougeL_precision": 0.06858742604428675, "rougeL_precision_stderr": 0.0013499175148544084, "rougeL_recall": 0.33533941614782714, "rougeL_recall_stderr": 0.00489862350776037, "rougeLsum_fmeasure": 0.10662100746126313, "rougeLsum_fmeasure_stderr": 0.0018761173400960428, "rougeLsum_precision": 0.06889204707695668, "rougeLsum_precision_stderr": 0.0014115064627086959, "rougeLsum_recall": 0.3353795561043291, "rougeLsum_recall_stderr": 0.0048767168213307635}}, "2": {"PALM_prompt": {"bleu": 0.5940498197237445, "bleu_stderr": 0.045360168270571564, "rouge1_fmeasure": 0.11593737527731507, "rouge1_fmeasure_stderr": 0.001949513588062909, "rouge1_precision": 0.07459577331672164, "rouge1_precision_stderr": 0.001595194413101619, "rouge1_recall": 0.38907556546905864, "rouge1_recall_stderr": 0.0054304691316662745, "rouge2_fmeasure": 0.05449451670049787, "rouge2_fmeasure_stderr": 0.0012399615319130626, "rouge2_precision": 0.035114673270987544, "rouge2_precision_stderr": 0.0010081353981524028, "rouge2_recall": 0.19584375989737013, "rouge2_recall_stderr": 0.004008082321996039, "rougeL_fmeasure": 0.10783733998051759, "rougeL_fmeasure_stderr": 0.0017619447024028836, "rougeL_precision": 0.06931044734650851, "rougeL_precision_stderr": 0.001428917200995628, "rougeL_recall": 0.35996374778547313, "rougeL_recall_stderr": 0.0049025475018146785, "rougeLsum_fmeasure": 0.10992722332009006, "rougeLsum_fmeasure_stderr": 0.0018365522736996146, "rougeLsum_precision": 0.07079493663216743, "rougeLsum_precision_stderr": 0.0015053431740885688, "rougeLsum_recall": 0.3670729702378272, "rougeLsum_recall_stderr": 0.00501564179221856}}, "3": {"PALM_prompt": {"bleu": 0.6360838559544998, "bleu_stderr": 0.03653875799485711, "rouge1_fmeasure": 0.11355856417583833, "rouge1_fmeasure_stderr": 0.0017735693070030508, "rouge1_precision": 0.07236011578940099, "rouge1_precision_stderr": 0.0013910988503604417, "rouge1_recall": 0.3928435128894812, "rouge1_recall_stderr": 0.005384069245593369, "rouge2_fmeasure": 0.05302505138393545, "rouge2_fmeasure_stderr": 0.0011266752102525668, "rouge2_precision": 0.033363396708401855, "rouge2_precision_stderr": 0.000803140401456714, "rouge2_recall": 0.19915667523109754, "rouge2_recall_stderr": 0.004011860806625624, "rougeL_fmeasure": 0.1048768596135265, "rougeL_fmeasure_stderr": 0.0016046493945327915, "rougeL_precision": 0.06687767191203486, "rougeL_precision_stderr": 0.0012600410796976124, "rougeL_recall": 0.3592559338816392, "rougeL_recall_stderr": 0.0047582409926964905, "rougeLsum_fmeasure": 0.10707993411906895, "rougeLsum_fmeasure_stderr": 0.0016657550135363104, "rougeLsum_precision": 0.0682810245315365, "rougeLsum_precision_stderr": 0.0013111163093307867, "rougeLsum_recall": 0.3687377662903542, "rougeLsum_recall_stderr": 0.004960565943300161}}, "4": {"PALM_prompt": {"bleu": 0.7027350329668681, "bleu_stderr": 0.0419156836814436, "rouge1_fmeasure": 0.11781335466174786, 
"rouge1_fmeasure_stderr": 0.0017761239549537581, "rouge1_precision": 0.07411348564826069, "rouge1_precision_stderr": 0.0012900003957059321, "rouge1_recall": 0.40994918124441987, "rouge1_recall_stderr": 0.005364326251363318, "rouge2_fmeasure": 0.05494234282461018, "rouge2_fmeasure_stderr": 0.001125668640595295, "rouge2_precision": 0.03434960773754005, "rouge2_precision_stderr": 0.0007831150259881873, "rouge2_recall": 0.20756196914680475, "rouge2_recall_stderr": 0.004035186327369416, "rougeL_fmeasure": 0.10796130107897399, "rougeL_fmeasure_stderr": 0.0015796761570714324, "rougeL_precision": 0.06799320620725614, "rougeL_precision_stderr": 0.0011446709666944061, "rougeL_recall": 0.37268673451611783, "rougeL_recall_stderr": 0.0047156874564148465, "rougeLsum_fmeasure": 0.11115469880289687, "rougeLsum_fmeasure_stderr": 0.001670218626963014, "rougeLsum_precision": 0.07001046072193005, "rougeLsum_precision_stderr": 0.0012147694362971294, "rougeLsum_recall": 0.38481667508014805, "rougeLsum_recall_stderr": 0.0049365819422690195}}, "5": {"PALM_prompt": {"bleu": 0.7868609033139111, "bleu_stderr": 0.040304136105284674, "rouge1_fmeasure": 0.12053057909919479, "rouge1_fmeasure_stderr": 0.0017893350216202717, "rouge1_precision": 0.07559215659324663, "rouge1_precision_stderr": 0.0013368561782119886, "rouge1_recall": 0.4322159752184723, "rouge1_recall_stderr": 0.005507418892615091, "rouge2_fmeasure": 0.05660554273746949, "rouge2_fmeasure_stderr": 0.0011398496071974332, "rouge2_precision": 0.03531136059294999, "rouge2_precision_stderr": 0.0008350132401271129, "rouge2_recall": 0.22153138326278524, "rouge2_recall_stderr": 0.004197864843142094, "rougeL_fmeasure": 0.1089770281372542, "rougeL_fmeasure_stderr": 0.0015496587154532736, "rougeL_precision": 0.0683868166969706, "rougeL_precision_stderr": 0.0011383097189411916, "rougeL_recall": 0.387608398091316, "rougeL_recall_stderr": 0.004759866077492818, "rougeLsum_fmeasure": 0.11313127630281221, "rougeLsum_fmeasure_stderr": 0.0016759508017601735, "rougeLsum_precision": 0.07105854805056085, "rougeLsum_precision_stderr": 0.0012600458442573047, "rougeLsum_recall": 0.40338460571037565, "rougeLsum_recall_stderr": 0.005019831598860692}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5100556841549206, "bleu_stderr": 0.056240452574111845, "rouge1_fmeasure": 0.1749169386171503, "rouge1_fmeasure_stderr": 0.0018652551820908833, "rouge1_precision": 0.14983850803367624, "rouge1_precision_stderr": 0.0018751672389164477, "rouge1_recall": 0.2550580803178272, "rouge1_recall_stderr": 0.0027794243641439476, "rouge2_fmeasure": 0.035914103574269995, "rouge2_fmeasure_stderr": 0.0008571739488027627, "rouge2_precision": 0.030382874922609843, "rouge2_precision_stderr": 0.0007519667664767079, "rouge2_recall": 0.054505775605249815, "rouge2_recall_stderr": 0.0014473521383030284, "rougeL_fmeasure": 0.13780562967661653, "rougeL_fmeasure_stderr": 0.001358236237017717, "rougeL_precision": 0.11682287834978397, "rougeL_precision_stderr": 0.0013372228496992414, "rougeL_recall": 0.20560214095840312, "rougeL_recall_stderr": 0.0022848479838345685, "rougeLsum_fmeasure": 0.16059954854119637, "rougeLsum_fmeasure_stderr": 0.0017011067980149748, "rougeLsum_precision": 0.13742857779903467, "rougeLsum_precision_stderr": 0.0017113854895127892, "rougeLsum_recall": 0.23501911131597872, "rougeLsum_recall_stderr": 0.0025756220873118974}}, "1": {"tldr_en": {"bleu": 2.965142818939576, "bleu_stderr": 0.06637064806778528, "rouge1_fmeasure": 0.22285165404429813, "rouge1_fmeasure_stderr": 0.0019735517320632746, 
"rouge1_precision": 0.1965390881322699, "rouge1_precision_stderr": 0.0022899400776314653, "rouge1_recall": 0.3206094100455321, "rouge1_recall_stderr": 0.002849589854137116, "rouge2_fmeasure": 0.056001917384198796, "rouge2_fmeasure_stderr": 0.0010665112573503633, "rouge2_precision": 0.05045558070635316, "rouge2_precision_stderr": 0.0011660432710824278, "rouge2_recall": 0.08246262292072058, "rouge2_recall_stderr": 0.0017565984624132588, "rougeL_fmeasure": 0.15830852566063647, "rougeL_fmeasure_stderr": 0.0013457659853471373, "rougeL_precision": 0.13895822267839106, "rougeL_precision_stderr": 0.0016253397048819503, "rougeL_recall": 0.23337530287957864, "rougeL_recall_stderr": 0.0022628070507281664, "rougeLsum_fmeasure": 0.20945729941888908, "rougeLsum_fmeasure_stderr": 0.0018462891651681006, "rougeLsum_precision": 0.18479852886338183, "rougeLsum_precision_stderr": 0.0021643241999535143, "rougeLsum_recall": 0.3019972782519105, "rougeLsum_recall_stderr": 0.002703215847122071}}, "2": {"tldr_en": {"bleu": 3.092360521821163, "bleu_stderr": 0.08937218000390199, "rouge1_fmeasure": 0.2236964233973595, "rouge1_fmeasure_stderr": 0.0019023763735918212, "rouge1_precision": 0.20155746658889173, "rouge1_precision_stderr": 0.0023374350511286197, "rouge1_recall": 0.3173353129081016, "rouge1_recall_stderr": 0.002820895911494938, "rouge2_fmeasure": 0.05753113933914173, "rouge2_fmeasure_stderr": 0.0010826117471515486, "rouge2_precision": 0.05269972304999503, "rouge2_precision_stderr": 0.001197033866295586, "rouge2_recall": 0.08395356718136517, "rouge2_recall_stderr": 0.0017436680982696616, "rougeL_fmeasure": 0.16108340919074526, "rougeL_fmeasure_stderr": 0.001357852843091288, "rougeL_precision": 0.14470120145484863, "rougeL_precision_stderr": 0.0017283958163244364, "rougeL_recall": 0.233378725309496, "rougeL_recall_stderr": 0.0023004114225280213, "rougeLsum_fmeasure": 0.21101035224845988, "rougeLsum_fmeasure_stderr": 0.001793590665001926, "rougeLsum_precision": 0.19006446021144513, "rougeLsum_precision_stderr": 0.002218603404878938, "rougeLsum_recall": 0.30023601631924435, "rougeLsum_recall_stderr": 0.002704859202966948}}, "3": {"tldr_en": {"bleu": 2.9732444729240193, "bleu_stderr": 0.09147492615114675, "rouge1_fmeasure": 0.1865213351213735, "rouge1_fmeasure_stderr": 0.0022753171506905965, "rouge1_precision": 0.17379375635824176, "rouge1_precision_stderr": 0.0026246110896642268, "rouge1_recall": 0.2638936090552038, "rouge1_recall_stderr": 0.003388128200410466, "rouge2_fmeasure": 0.04592788875910395, "rouge2_fmeasure_stderr": 0.0010286025903021396, "rouge2_precision": 0.04282623790563042, "rouge2_precision_stderr": 0.0011568698796170502, "rouge2_recall": 0.06755023246625483, "rouge2_recall_stderr": 0.0016980876475847096, "rougeL_fmeasure": 0.13306544091830314, "rougeL_fmeasure_stderr": 0.0015985646389652382, "rougeL_precision": 0.12470876767206059, "rougeL_precision_stderr": 0.00198989611192214, "rougeL_recall": 0.1925552246827813, "rougeL_recall_stderr": 0.0026451539087018585, "rougeLsum_fmeasure": 0.1760787377574676, "rougeLsum_fmeasure_stderr": 0.0021410940507585008, "rougeLsum_precision": 0.1641047927743934, "rougeLsum_precision_stderr": 0.0024936634430607935, "rougeLsum_recall": 0.24985286020591305, "rougeLsum_recall_stderr": 0.0032309944265283695}}, "4": {"tldr_en": {"bleu": 0.6533679656816203, "bleu_stderr": 0.04947451807737188, "rouge1_fmeasure": 0.0605919538076038, "rouge1_fmeasure_stderr": 0.0020099409218817697, "rouge1_precision": 0.059448771232523664, "rouge1_precision_stderr": 0.002218569421407597, 
"rouge1_recall": 0.08822441141978317, "rouge1_recall_stderr": 0.003020915797584135, "rouge2_fmeasure": 0.015593229425275027, "rouge2_fmeasure_stderr": 0.0007474058630918573, "rouge2_precision": 0.014949718004119315, "rouge2_precision_stderr": 0.0008976901887326936, "rouge2_recall": 0.02428038352612278, "rouge2_recall_stderr": 0.001276025523659211, "rougeL_fmeasure": 0.04486687523926569, "rougeL_fmeasure_stderr": 0.0014834001600610909, "rougeL_precision": 0.0446701681314208, "rougeL_precision_stderr": 0.0017283868647508462, "rougeL_recall": 0.06670046801956148, "rougeL_recall_stderr": 0.002362351165144592, "rougeLsum_fmeasure": 0.057104894137255724, "rougeLsum_fmeasure_stderr": 0.0018950436973998253, "rougeLsum_precision": 0.056074265222838884, "rougeLsum_precision_stderr": 0.0020960296913534976, "rougeLsum_recall": 0.08343237936875103, "rougeLsum_recall_stderr": 0.00287322955407947}}, "5": {"tldr_en": {"bleu": 4.1457161238536384e-07, "bleu_stderr": 9.147191188223124e-07, "rouge1_fmeasure": 0.00929579656840014, "rouge1_fmeasure_stderr": 0.0008648371348177522, "rouge1_precision": 0.009348211882001287, "rouge1_precision_stderr": 0.0009832878517389112, "rouge1_recall": 0.013688335464355616, "rouge1_recall_stderr": 0.0013308943056046898, "rouge2_fmeasure": 0.002384610836596322, "rouge2_fmeasure_stderr": 0.00028677749720556095, "rouge2_precision": 0.002530983918335585, "rouge2_precision_stderr": 0.00045036458463964445, "rouge2_recall": 0.00396277708838197, "rouge2_recall_stderr": 0.0005637401793403029, "rougeL_fmeasure": 0.006964849797303445, "rougeL_fmeasure_stderr": 0.0006505850643049141, "rougeL_precision": 0.007189697647068899, "rougeL_precision_stderr": 0.0008082293363466722, "rougeL_recall": 0.010450449827034726, "rougeL_recall_stderr": 0.0010632636910403864, "rougeLsum_fmeasure": 0.008747681514956059, "rougeLsum_fmeasure_stderr": 0.0008159791184455816, "rougeLsum_precision": 0.008774341101660414, "rougeLsum_precision_stderr": 0.0009302597680921362, "rougeLsum_recall": 0.012985553018433522, "rougeLsum_recall_stderr": 0.001276501870842985}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.37083773151234173, "bleu_stderr": 0.023770267793187624, "rouge1_fmeasure": 0.08851879719705498, "rouge1_fmeasure_stderr": 0.0008401305153826185, "rouge1_precision": 0.06608746744853429, "rouge1_precision_stderr": 0.0006810687592344808, "rouge1_recall": 0.14195087368704973, "rouge1_recall_stderr": 0.0012610802186243078, "rouge2_fmeasure": 0.0045379311452906496, "rouge2_fmeasure_stderr": 0.00042315898801261495, "rouge2_precision": 0.0034258087224012515, "rouge2_precision_stderr": 0.00032514807417983457, "rouge2_recall": 0.00704033391818721, "rouge2_recall_stderr": 0.0006361810577325418, "rougeL_fmeasure": 0.08820722536535333, "rougeL_fmeasure_stderr": 0.0008208954817151469, "rougeL_precision": 0.06585617168656571, "rougeL_precision_stderr": 0.0006670941024074856, "rougeL_recall": 0.14144906524240433, "rougeL_recall_stderr": 0.0012314615868737277, "rougeLsum_fmeasure": 0.07382731521584816, "rougeLsum_fmeasure_stderr": 0.0006992840645883968, "rougeLsum_precision": 0.05497072058739539, "rougeLsum_precision_stderr": 0.0005636175471074675, "rougeLsum_recall": 0.11929179350942047, "rougeLsum_recall_stderr": 0.0010831868664373235}}, "1": {"generate_text_restaurant": {"bleu": 11.210308239038657, "bleu_stderr": 0.13532926187041483, "rouge1_fmeasure": 0.44364097425563637, "rouge1_fmeasure_stderr": 0.0024213391823928623, "rouge1_precision": 0.5337134369290545, "rouge1_precision_stderr": 
0.0033441618301280323, "rouge1_recall": 0.4186112103011673, "rouge1_recall_stderr": 0.003029511762554002, "rouge2_fmeasure": 0.20464247864479315, "rouge2_fmeasure_stderr": 0.0019524189816088483, "rouge2_precision": 0.2503358019053269, "rouge2_precision_stderr": 0.0025907821242386786, "rouge2_recall": 0.19267261882751974, "rouge2_recall_stderr": 0.002098816040806122, "rougeL_fmeasure": 0.3179154131738048, "rougeL_fmeasure_stderr": 0.002060565311895604, "rougeL_precision": 0.3856908650245338, "rougeL_precision_stderr": 0.002931334041050201, "rougeL_recall": 0.2989387799212162, "rougeL_recall_stderr": 0.0024147108908862333, "rougeLsum_fmeasure": 0.35856203407161186, "rougeLsum_fmeasure_stderr": 0.0023230420120309756, "rougeLsum_precision": 0.4329536166939937, "rougeLsum_precision_stderr": 0.0031784168539919287, "rougeLsum_recall": 0.33757778280843753, "rougeLsum_recall_stderr": 0.0027140890148101187}}, "2": {"generate_text_restaurant": {"bleu": 12.855010359789356, "bleu_stderr": 0.1433052434909132, "rouge1_fmeasure": 0.4715984211104789, "rouge1_fmeasure_stderr": 0.002279297442582234, "rouge1_precision": 0.5659453062773773, "rouge1_precision_stderr": 0.003270807439335411, "rouge1_recall": 0.4426574876800998, "rouge1_recall_stderr": 0.002892096018797496, "rouge2_fmeasure": 0.22811726769856144, "rouge2_fmeasure_stderr": 0.0019828962661021204, "rouge2_precision": 0.27838189797515667, "rouge2_precision_stderr": 0.002671533749085893, "rouge2_recall": 0.21363966698505715, "rouge2_recall_stderr": 0.0021189348260758872, "rougeL_fmeasure": 0.3403464767676166, "rougeL_fmeasure_stderr": 0.0020236690372816674, "rougeL_precision": 0.4103597152277875, "rougeL_precision_stderr": 0.002911142472749296, "rougeL_recall": 0.319223478072641, "rougeL_recall_stderr": 0.0023906932022469385, "rougeLsum_fmeasure": 0.38431431509550784, "rougeLsum_fmeasure_stderr": 0.0022508604583330134, "rougeLsum_precision": 0.461695670382684, "rougeLsum_precision_stderr": 0.0031155745746790013, "rougeLsum_recall": 0.36069384550050293, "rougeLsum_recall_stderr": 0.002666755684059944}}, "3": {"generate_text_restaurant": {"bleu": 13.64156249802983, "bleu_stderr": 0.15061545188803951, "rouge1_fmeasure": 0.48080915530796386, "rouge1_fmeasure_stderr": 0.002234979483832272, "rouge1_precision": 0.5750153314520224, "rouge1_precision_stderr": 0.0031859723554921284, "rouge1_recall": 0.4510440289695527, "rouge1_recall_stderr": 0.0028677830005739494, "rouge2_fmeasure": 0.23831207673207944, "rouge2_fmeasure_stderr": 0.002020312344935205, "rouge2_precision": 0.28884561820039295, "rouge2_precision_stderr": 0.002639498857757876, "rouge2_recall": 0.22357397871649595, "rouge2_recall_stderr": 0.0022043371695741783, "rougeL_fmeasure": 0.3492803919842545, "rougeL_fmeasure_stderr": 0.0020660644514658923, "rougeL_precision": 0.4195446580367559, "rougeL_precision_stderr": 0.0029133142431469273, "rougeL_recall": 0.3276080175727646, "rougeL_recall_stderr": 0.0024527144670301914, "rougeLsum_fmeasure": 0.39480073772752605, "rougeLsum_fmeasure_stderr": 0.0022686218848066435, "rougeLsum_precision": 0.4727009044140323, "rougeLsum_precision_stderr": 0.0031160796738222674, "rougeLsum_recall": 0.37055635583114344, "rougeLsum_recall_stderr": 0.0027125904536736854}}, "4": {"generate_text_restaurant": {"bleu": 13.48978762764689, "bleu_stderr": 0.18010374224713105, "rouge1_fmeasure": 0.48299936414020656, "rouge1_fmeasure_stderr": 0.0021956091228780565, "rouge1_precision": 0.5783755973364397, "rouge1_precision_stderr": 0.003188421667895018, "rouge1_recall": 
0.4521748484155905, "rouge1_recall_stderr": 0.0028320142916950963, "rouge2_fmeasure": 0.23895749463268698, "rouge2_fmeasure_stderr": 0.00196124259284921, "rouge2_precision": 0.2905353846413916, "rouge2_precision_stderr": 0.0026426279354766213, "rouge2_recall": 0.22353380032103162, "rouge2_recall_stderr": 0.0021373490578138086, "rougeL_fmeasure": 0.347484945544206, "rougeL_fmeasure_stderr": 0.002079636990372097, "rougeL_precision": 0.4173159956369907, "rougeL_precision_stderr": 0.0029141064950507158, "rougeL_recall": 0.3255911950461323, "rougeL_recall_stderr": 0.0024733626405686934, "rougeLsum_fmeasure": 0.3952584823764592, "rougeLsum_fmeasure_stderr": 0.00227735538013133, "rougeLsum_precision": 0.4733193294500394, "rougeLsum_precision_stderr": 0.003118544106260605, "rougeLsum_recall": 0.3704278657572045, "rougeLsum_recall_stderr": 0.0027237053634839347}}, "5": {"generate_text_restaurant": {"bleu": 13.585319575612113, "bleu_stderr": 0.15871453640685637, "rouge1_fmeasure": 0.48786585090169554, "rouge1_fmeasure_stderr": 0.00223672743807504, "rouge1_precision": 0.5799683476103964, "rouge1_precision_stderr": 0.0031994912520596636, "rouge1_recall": 0.45706214916858773, "rouge1_recall_stderr": 0.002844413918683844, "rouge2_fmeasure": 0.24237216469601944, "rouge2_fmeasure_stderr": 0.0019981286651549568, "rouge2_precision": 0.29236226184175695, "rouge2_precision_stderr": 0.0026385490403355457, "rouge2_recall": 0.22695605327657692, "rouge2_recall_stderr": 0.0021727413185400174, "rougeL_fmeasure": 0.34850083462923376, "rougeL_fmeasure_stderr": 0.002084486638424466, "rougeL_precision": 0.4148253441054077, "rougeL_precision_stderr": 0.00283534958682861, "rougeL_recall": 0.32700945528721864, "rougeL_recall_stderr": 0.002473535129442898, "rougeLsum_fmeasure": 0.39837100157486816, "rougeLsum_fmeasure_stderr": 0.002290117281740733, "rougeLsum_precision": 0.4734672508470655, "rougeLsum_precision_stderr": 0.0030825263814209243, "rougeLsum_recall": 0.37359433682039755, "rougeLsum_recall_stderr": 0.0027168498928153875}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.1113783413326344, "bleu_stderr": 0.11912014552832849, "rouge1_fmeasure": 0.2136631527247442, "rouge1_fmeasure_stderr": 0.0026638229066998256, "rouge1_precision": 0.16876091355105496, "rouge1_precision_stderr": 0.002428477465457079, "rouge1_recall": 0.3341076035740682, "rouge1_recall_stderr": 0.0046048681292224745, "rouge2_fmeasure": 0.04856213249829562, "rouge2_fmeasure_stderr": 0.0017561436649774235, "rouge2_precision": 0.03747194366522528, "rouge2_precision_stderr": 0.0014955773574022288, "rouge2_recall": 0.07911851428210626, "rouge2_recall_stderr": 0.002822698047160193, "rougeL_fmeasure": 0.1610160340198798, "rougeL_fmeasure_stderr": 0.0021142466349326873, "rougeL_precision": 0.1271557431168007, "rougeL_precision_stderr": 0.0019539789800342004, "rougeL_recall": 0.25284226031166046, "rougeL_recall_stderr": 0.0036682449427253147, "rougeLsum_fmeasure": 0.16787123550272315, "rougeLsum_fmeasure_stderr": 0.0023335572634387495, "rougeLsum_precision": 0.13217520866885155, "rougeLsum_precision_stderr": 0.002071100870900028, "rougeLsum_recall": 0.26428103009111936, "rougeLsum_recall_stderr": 0.004079358230845003}}, "1": {"article_DOC_summary": {"bleu": 1.4807516008262442, "bleu_stderr": 0.10561063787286398, "rouge1_fmeasure": 0.18170879656881125, "rouge1_fmeasure_stderr": 0.00245218614539981, "rouge1_precision": 0.12927526554265445, "rouge1_precision_stderr": 0.0018265553859337874, "rouge1_recall": 0.3185510498370038, "rouge1_recall_stderr": 
0.004229864775990553, "rouge2_fmeasure": 0.03694944789541673, "rouge2_fmeasure_stderr": 0.001451072652969939, "rouge2_precision": 0.02602935902165096, "rouge2_precision_stderr": 0.001023224048321341, "rouge2_recall": 0.06652536681300213, "rouge2_recall_stderr": 0.002722006503197127, "rougeL_fmeasure": 0.14062720750461896, "rougeL_fmeasure_stderr": 0.0018793551451232017, "rougeL_precision": 0.09989029850197624, "rougeL_precision_stderr": 0.0013879057374961837, "rougeL_recall": 0.2479107129331233, "rougeL_recall_stderr": 0.003381030448367971, "rougeLsum_fmeasure": 0.14570314466301257, "rougeLsum_fmeasure_stderr": 0.002064223760081791, "rougeLsum_precision": 0.10341765201003357, "rougeLsum_precision_stderr": 0.0015149831274551388, "rougeLsum_recall": 0.2571991639086138, "rougeLsum_recall_stderr": 0.003711770412976227}}, "2": {"article_DOC_summary": {"bleu": 1.6562203262313626, "bleu_stderr": 0.07982304015370187, "rouge1_fmeasure": 0.1847484789507431, "rouge1_fmeasure_stderr": 0.0024906095591687542, "rouge1_precision": 0.13145946154632493, "rouge1_precision_stderr": 0.0018544974879548131, "rouge1_recall": 0.32387324210035007, "rouge1_recall_stderr": 0.004267761663343424, "rouge2_fmeasure": 0.03989446165864525, "rouge2_fmeasure_stderr": 0.001470552354617615, "rouge2_precision": 0.02808308652134632, "rouge2_precision_stderr": 0.0010347320981807847, "rouge2_recall": 0.07187873077263424, "rouge2_recall_stderr": 0.0027472238907884166, "rougeL_fmeasure": 0.14356843077788037, "rougeL_fmeasure_stderr": 0.0018927425748884437, "rougeL_precision": 0.1019161886732123, "rougeL_precision_stderr": 0.001391499897049356, "rougeL_recall": 0.2535896765342832, "rougeL_recall_stderr": 0.0034183375598958493, "rougeLsum_fmeasure": 0.14741895358409335, "rougeLsum_fmeasure_stderr": 0.0020743741796644924, "rougeLsum_precision": 0.10460558980438973, "rougeLsum_precision_stderr": 0.0015202856124887812, "rougeLsum_recall": 0.2605683575765448, "rougeLsum_recall_stderr": 0.0037100189499078345}}, "3": {"article_DOC_summary": {"bleu": 1.7657291524477807, "bleu_stderr": 0.11405030461208435, "rouge1_fmeasure": 0.18390048441125048, "rouge1_fmeasure_stderr": 0.0026657496989706104, "rouge1_precision": 0.13318228571463592, "rouge1_precision_stderr": 0.0020455353549147103, "rouge1_recall": 0.317018855215892, "rouge1_recall_stderr": 0.004518876538639268, "rouge2_fmeasure": 0.040350616115479146, "rouge2_fmeasure_stderr": 0.0014683337326124046, "rouge2_precision": 0.028785220783836416, "rouge2_precision_stderr": 0.0010625475528616283, "rouge2_recall": 0.07136621731857556, "rouge2_recall_stderr": 0.0026618058930387434, "rougeL_fmeasure": 0.14086692618233734, "rougeL_fmeasure_stderr": 0.0020271972409091304, "rougeL_precision": 0.10172139078346341, "rougeL_precision_stderr": 0.0015298521300718705, "rougeL_recall": 0.2445625759480037, "rougeL_recall_stderr": 0.003569632794081247, "rougeLsum_fmeasure": 0.14729860857759783, "rougeLsum_fmeasure_stderr": 0.002200650446133668, "rougeLsum_precision": 0.10639989816115737, "rougeLsum_precision_stderr": 0.0016639989017194455, "rougeLsum_recall": 0.255751047100679, "rougeLsum_recall_stderr": 0.0038734069167402317}}, "4": {"article_DOC_summary": {"bleu": 0.7368905267923197, "bleu_stderr": 0.10568292014256596, "rouge1_fmeasure": 0.04858562888319941, "rouge1_fmeasure_stderr": 0.0027184130065024178, "rouge1_precision": 0.0415260420063545, "rouge1_precision_stderr": 0.0026056508945012298, "rouge1_recall": 0.07757055211542944, "rouge1_recall_stderr": 0.004506312599731837, "rouge2_fmeasure": 
0.009949994948206649, "rouge2_fmeasure_stderr": 0.0008716091074829112, "rouge2_precision": 0.0074966026006513965, "rouge2_precision_stderr": 0.0006746135809758764, "rouge2_recall": 0.0173613412544955, "rouge2_recall_stderr": 0.0015976731193386722, "rougeL_fmeasure": 0.036593331243362004, "rougeL_fmeasure_stderr": 0.0020450096289478343, "rougeL_precision": 0.03194572657523972, "rougeL_precision_stderr": 0.002158988118941094, "rougeL_recall": 0.05898387332591097, "rougeL_recall_stderr": 0.003484814745152857, "rougeLsum_fmeasure": 0.0385888024512549, "rougeLsum_fmeasure_stderr": 0.002163672046764109, "rougeLsum_precision": 0.033578484743865225, "rougeLsum_precision_stderr": 0.002237487835101602, "rougeLsum_recall": 0.06211470890687305, "rougeLsum_recall_stderr": 0.0036822940469355132}}, "5": {"article_DOC_summary": {"bleu": 1.4041260567643717e-38, "bleu_stderr": 1.3322404897773207e-33, "rouge1_fmeasure": 0.0027194147089883844, "rouge1_fmeasure_stderr": 0.0007296230437372524, "rouge1_precision": 0.0029927391798704375, "rouge1_precision_stderr": 0.0008124604444580753, "rouge1_recall": 0.002590023053043221, "rouge1_recall_stderr": 0.0006974154226908588, "rouge2_fmeasure": 0.00021508550838189274, "rouge2_fmeasure_stderr": 9.668205684028845e-05, "rouge2_precision": 0.000247284899936924, "rouge2_precision_stderr": 0.00011058519545078262, "rouge2_recall": 0.00019440465195182176, "rouge2_recall_stderr": 8.882271405728297e-05, "rougeL_fmeasure": 0.0018356640850194482, "rougeL_fmeasure_stderr": 0.0004879741431962152, "rougeL_precision": 0.002007763097414947, "rougeL_precision_stderr": 0.0005311869704827251, "rougeL_recall": 0.0017507054146174415, "rougeL_recall_stderr": 0.0004724443085314579, "rougeLsum_fmeasure": 0.0020761523840976182, "rougeLsum_fmeasure_stderr": 0.0005629570167031783, "rougeLsum_precision": 0.0022522452753482606, "rougeLsum_precision_stderr": 0.0006086895267107963, "rougeLsum_recall": 0.0020127265562168294, "rougeLsum_recall_stderr": 0.0005598466140344547}}}}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_0.csv b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_0.csv
new file mode 100644
index 0000000000000000000000000000000000000000..743a45fb0978396b5d9856a43a162f639901fa4f
--- /dev/null
+++ b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_0.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.329,0.014865395385928364,0
+anli_r2,acc,0.326,0.01483050720454104,0
+anli_r3,acc,0.3433333333333333,0.01371263383046586,0
+arc_challenge,acc,0.26706484641638223,0.012928933196496354,0
+arc_challenge,acc_norm,0.28924914675767915,0.013250012579393443,0
+arc_easy,acc,0.6014309764309764,0.010046455400477937,0
+arc_easy,acc_norm,0.531986531986532,0.010238767643185714,0
+boolq,acc,0.5825688073394495,0.00862499005021668,1
+cb,acc,0.375,0.06527912098338669,1
+cb,f1,0.2127316680096696,,1
+copa,acc,0.79,0.040936018074033256,0
+hellaswag,acc,0.47400916152160927,0.00498303542023572,0
+hellaswag,acc_norm,0.6269667396932882,0.004826224784850451,0
+piqa,acc,0.7551686615886833,0.010032309105568795,0
+piqa,acc_norm,0.763873775843308,0.009908965890558218,0
+rte,acc,0.5595667870036101,0.02988212336311872,0
+sciq,acc,0.85,0.011297239823409314,0
+sciq,acc_norm,0.752,0.013663187134877637,0
+storycloze_2016,acc,0.7167290219134153,0.010419760409155363,0
+winogrande,acc,0.5864246250986582,0.013840971763195304,0
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_0_lm-eval_global_step80108_2023-02-22-18-53-17_0shots_backup.json b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_0_lm-eval_global_step80108_2023-02-22-18-53-17_0shots_backup.json
deleted file mode 100644
index cc43c07c112a8ca7433f2a5f4da0e4ed468fe6c9..0000000000000000000000000000000000000000
--- a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_0_lm-eval_global_step80108_2023-02-22-18-53-17_0shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
- "results": {
- "anli_r1": {
- "acc": 0.329,
- "acc_stderr": 0.014865395385928364
- },
- "anli_r2": {
- "acc": 0.326,
- "acc_stderr": 0.01483050720454104
- },
- "anli_r3": {
- "acc": 0.3433333333333333,
- "acc_stderr": 0.01371263383046586
- },
- "cb": {
- "acc": 0.375,
- "acc_stderr": 0.06527912098338669,
- "f1": 0.2127316680096696
- },
- "copa": {
- "acc": 0.79,
- "acc_stderr": 0.040936018074033256
- },
- "hellaswag": {
- "acc": 0.47400916152160927,
- "acc_stderr": 0.00498303542023572,
- "acc_norm": 0.6269667396932882,
- "acc_norm_stderr": 0.004826224784850451
- },
- "rte": {
- "acc": 0.5595667870036101,
- "acc_stderr": 0.02988212336311872
- },
- "winogrande": {
- "acc": 0.5864246250986582,
- "acc_stderr": 0.013840971763195304
- },
- "storycloze_2016": {
- "acc": 0.7167290219134153,
- "acc_stderr": 0.010419760409155363
- },
- "boolq": {
- "acc": 0.5825688073394495,
- "acc_stderr": 0.00862499005021668
- },
- "arc_easy": {
- "acc": 0.6014309764309764,
- "acc_stderr": 0.010046455400477937,
- "acc_norm": 0.531986531986532,
- "acc_norm_stderr": 0.010238767643185714
- },
- "arc_challenge": {
- "acc": 0.26706484641638223,
- "acc_stderr": 0.012928933196496354,
- "acc_norm": 0.28924914675767915,
- "acc_norm_stderr": 0.013250012579393443
- },
- "sciq": {
- "acc": 0.85,
- "acc_stderr": 0.011297239823409314,
- "acc_norm": 0.752,
- "acc_norm_stderr": 0.013663187134877637
- },
- "piqa": {
- "acc": 0.7551686615886833,
- "acc_stderr": 0.010032309105568795,
- "acc_norm": 0.763873775843308,
- "acc_norm_stderr": 0.009908965890558218
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
-}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_1.csv b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_1.csv
new file mode 100644
index 0000000000000000000000000000000000000000..0291f7ac81a6eec29b99ef4d3e9a4c6152e116e3
--- /dev/null
+++ b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_1.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.327,0.01484221315341124,0
+anli_r2,acc,0.332,0.014899597242811492,0
+anli_r3,acc,0.34,0.01368049572576779,0
+arc_challenge,acc,0.28071672354948807,0.013131238126975586,0
+arc_challenge,acc_norm,0.31313993174061433,0.013552671543623504,0
+arc_easy,acc,0.6031144781144782,0.010039236800583209,0
+arc_easy,acc_norm,0.5723905723905723,0.010151683397430673,0
+boolq,acc,0.5788990825688073,0.008635491562221344,1
+cb,acc,0.5,0.06741998624632421,1
+cb,f1,0.35057471264367807,,1
+copa,acc,0.8,0.040201512610368445,0
+hellaswag,acc,0.4731129257120096,0.0049825618152141244,0
+hellaswag,acc_norm,0.6270663214499104,0.004825963768772216,0
+piqa,acc,0.7589771490750816,0.009979042717267314,0
+piqa,acc_norm,0.7616974972796517,0.009940334245876219,0
+rte,acc,0.5306859205776173,0.03003973059219781,0
+sciq,acc,0.892,0.009820001651345696,0
+sciq,acc_norm,0.89,0.009899393819724446,0
+storycloze_2016,acc,0.7140566541956174,0.010449259851345842,0
+winogrande,acc,0.574585635359116,0.013895257666646378,0
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_1_lm-eval_global_step80108_2023-02-22-18-53-17_1shots_backup.json b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_1_lm-eval_global_step80108_2023-02-22-18-53-17_1shots_backup.json
deleted file mode 100644
index 0177e9b075d745bdb2f14fe8385744b2e4ab086c..0000000000000000000000000000000000000000
--- a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_1_lm-eval_global_step80108_2023-02-22-18-53-17_1shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
- "results": {
- "anli_r1": {
- "acc": 0.327,
- "acc_stderr": 0.01484221315341124
- },
- "anli_r2": {
- "acc": 0.332,
- "acc_stderr": 0.014899597242811492
- },
- "anli_r3": {
- "acc": 0.34,
- "acc_stderr": 0.01368049572576779
- },
- "cb": {
- "acc": 0.5,
- "acc_stderr": 0.06741998624632421,
- "f1": 0.35057471264367807
- },
- "copa": {
- "acc": 0.8,
- "acc_stderr": 0.040201512610368445
- },
- "hellaswag": {
- "acc": 0.4731129257120096,
- "acc_stderr": 0.0049825618152141244,
- "acc_norm": 0.6270663214499104,
- "acc_norm_stderr": 0.004825963768772216
- },
- "rte": {
- "acc": 0.5306859205776173,
- "acc_stderr": 0.03003973059219781
- },
- "winogrande": {
- "acc": 0.574585635359116,
- "acc_stderr": 0.013895257666646378
- },
- "storycloze_2016": {
- "acc": 0.7140566541956174,
- "acc_stderr": 0.010449259851345842
- },
- "boolq": {
- "acc": 0.5788990825688073,
- "acc_stderr": 0.008635491562221344
- },
- "arc_easy": {
- "acc": 0.6031144781144782,
- "acc_stderr": 0.010039236800583209,
- "acc_norm": 0.5723905723905723,
- "acc_norm_stderr": 0.010151683397430673
- },
- "arc_challenge": {
- "acc": 0.28071672354948807,
- "acc_stderr": 0.013131238126975586,
- "acc_norm": 0.31313993174061433,
- "acc_norm_stderr": 0.013552671543623504
- },
- "sciq": {
- "acc": 0.892,
- "acc_stderr": 0.009820001651345696,
- "acc_norm": 0.89,
- "acc_norm_stderr": 0.009899393819724446
- },
- "piqa": {
- "acc": 0.7589771490750816,
- "acc_stderr": 0.009979042717267314,
- "acc_norm": 0.7616974972796517,
- "acc_norm_stderr": 0.009940334245876219
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
-}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_2.csv b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_2.csv
new file mode 100644
index 0000000000000000000000000000000000000000..0bf9d233ce10712b795ba2a6a0a3ff47705f4b7e
--- /dev/null
+++ b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_2.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.308,0.014606483127342761,0
+anli_r2,acc,0.334,0.014922019523732965,0
+anli_r3,acc,0.32166666666666666,0.013490095282989521,0
+arc_challenge,acc,0.28668941979522183,0.013214986329274774,0
+arc_challenge,acc_norm,0.310580204778157,0.01352229209805305,0
+arc_easy,acc,0.6212121212121212,0.009953737656542037,0
+arc_easy,acc_norm,0.5963804713804713,0.010067368960348216,0
+boolq,acc,0.6039755351681957,0.008553881336813415,1
+cb,acc,0.4107142857142857,0.0663363415035954,1
+cb,f1,0.24493628437290407,,1
+copa,acc,0.81,0.03942772444036623,0
+hellaswag,acc,0.47122087233618803,0.0049815090992763504,0
+hellaswag,acc_norm,0.6280621390161323,0.004823341569605425,0
+piqa,acc,0.7540805223068553,0.01004733186562519,0
+piqa,acc_norm,0.764961915125136,0.009893146688805315,0
+rte,acc,0.48375451263537905,0.030080573208738064,0
+sciq,acc,0.903,0.009363689373248102,0
+sciq,acc_norm,0.893,0.009779910359847167,0
+storycloze_2016,acc,0.7167290219134153,0.010419760409155363,0
+winogrande,acc,0.590370955011839,0.013821049109655478,0
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_2_lm-eval_global_step80108_2023-02-22-18-53-17_2shots_backup.json b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_2_lm-eval_global_step80108_2023-02-22-18-53-17_2shots_backup.json
deleted file mode 100644
index 1dfd092b583e9761502d727edc5d9b55afa07bf0..0000000000000000000000000000000000000000
--- a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_2_lm-eval_global_step80108_2023-02-22-18-53-17_2shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
- "results": {
- "anli_r1": {
- "acc": 0.308,
- "acc_stderr": 0.014606483127342761
- },
- "anli_r2": {
- "acc": 0.334,
- "acc_stderr": 0.014922019523732965
- },
- "anli_r3": {
- "acc": 0.32166666666666666,
- "acc_stderr": 0.013490095282989521
- },
- "cb": {
- "acc": 0.4107142857142857,
- "acc_stderr": 0.0663363415035954,
- "f1": 0.24493628437290407
- },
- "copa": {
- "acc": 0.81,
- "acc_stderr": 0.03942772444036623
- },
- "hellaswag": {
- "acc": 0.47122087233618803,
- "acc_stderr": 0.0049815090992763504,
- "acc_norm": 0.6280621390161323,
- "acc_norm_stderr": 0.004823341569605425
- },
- "rte": {
- "acc": 0.48375451263537905,
- "acc_stderr": 0.030080573208738064
- },
- "winogrande": {
- "acc": 0.590370955011839,
- "acc_stderr": 0.013821049109655478
- },
- "storycloze_2016": {
- "acc": 0.7167290219134153,
- "acc_stderr": 0.010419760409155363
- },
- "boolq": {
- "acc": 0.6039755351681957,
- "acc_stderr": 0.008553881336813415
- },
- "arc_easy": {
- "acc": 0.6212121212121212,
- "acc_stderr": 0.009953737656542037,
- "acc_norm": 0.5963804713804713,
- "acc_norm_stderr": 0.010067368960348216
- },
- "arc_challenge": {
- "acc": 0.28668941979522183,
- "acc_stderr": 0.013214986329274774,
- "acc_norm": 0.310580204778157,
- "acc_norm_stderr": 0.01352229209805305
- },
- "sciq": {
- "acc": 0.903,
- "acc_stderr": 0.009363689373248102,
- "acc_norm": 0.893,
- "acc_norm_stderr": 0.009779910359847167
- },
- "piqa": {
- "acc": 0.7540805223068553,
- "acc_stderr": 0.01004733186562519,
- "acc_norm": 0.764961915125136,
- "acc_norm_stderr": 0.009893146688805315
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
-}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_3.csv b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_3.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e25e487c45c8e9a667ed44c5d7d398f9774bcfc1
--- /dev/null
+++ b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_3.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.316,0.014709193056057121,0
+anli_r2,acc,0.347,0.015060472031706622,0
+anli_r3,acc,0.3308333333333333,0.013588208070709007,0
+arc_challenge,acc,0.29948805460750855,0.013385021637313562,0
+arc_challenge,acc_norm,0.31569965870307165,0.013582571095815291,0
+arc_easy,acc,0.6321548821548821,0.00989492346445519,0
+arc_easy,acc_norm,0.6123737373737373,0.009997307914447612,0
+boolq,acc,0.6030581039755352,0.008557276964675146,1
+cb,acc,0.5178571428571429,0.06737697508644647,1
+cb,f1,0.39707602339181286,,1
+copa,acc,0.81,0.03942772444036623,0
+hellaswag,acc,0.47211710814578767,0.004982016702445961,0
+hellaswag,acc_norm,0.6292571200955985,0.004820166002253063,0
+piqa,acc,0.7611534276387377,0.0099481203853375,0
+piqa,acc_norm,0.7611534276387377,0.009948120385337484,0
+rte,acc,0.5234657039711191,0.03006330041190266,0
+sciq,acc,0.915,0.008823426366942314,0
+sciq,acc_norm,0.909,0.009099549538400241,0
+storycloze_2016,acc,0.7258150721539284,0.010316062787590011,0
+winogrande,acc,0.5943172849250198,0.013800206336014208,0
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_3_lm-eval_global_step80108_2023-02-22-18-53-17_3shots_backup.json b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_3_lm-eval_global_step80108_2023-02-22-18-53-17_3shots_backup.json
deleted file mode 100644
index ce5018faeab2ab62373880a3bcd2c7c134ed6af9..0000000000000000000000000000000000000000
--- a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_3_lm-eval_global_step80108_2023-02-22-18-53-17_3shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
- "results": {
- "anli_r1": {
- "acc": 0.316,
- "acc_stderr": 0.014709193056057121
- },
- "anli_r2": {
- "acc": 0.347,
- "acc_stderr": 0.015060472031706622
- },
- "anli_r3": {
- "acc": 0.3308333333333333,
- "acc_stderr": 0.013588208070709007
- },
- "cb": {
- "acc": 0.5178571428571429,
- "acc_stderr": 0.06737697508644647,
- "f1": 0.39707602339181286
- },
- "copa": {
- "acc": 0.81,
- "acc_stderr": 0.03942772444036623
- },
- "hellaswag": {
- "acc": 0.47211710814578767,
- "acc_stderr": 0.004982016702445961,
- "acc_norm": 0.6292571200955985,
- "acc_norm_stderr": 0.004820166002253063
- },
- "rte": {
- "acc": 0.5234657039711191,
- "acc_stderr": 0.03006330041190266
- },
- "winogrande": {
- "acc": 0.5943172849250198,
- "acc_stderr": 0.013800206336014208
- },
- "storycloze_2016": {
- "acc": 0.7258150721539284,
- "acc_stderr": 0.010316062787590011
- },
- "boolq": {
- "acc": 0.6030581039755352,
- "acc_stderr": 0.008557276964675146
- },
- "arc_easy": {
- "acc": 0.6321548821548821,
- "acc_stderr": 0.00989492346445519,
- "acc_norm": 0.6123737373737373,
- "acc_norm_stderr": 0.009997307914447612
- },
- "arc_challenge": {
- "acc": 0.29948805460750855,
- "acc_stderr": 0.013385021637313562,
- "acc_norm": 0.31569965870307165,
- "acc_norm_stderr": 0.013582571095815291
- },
- "sciq": {
- "acc": 0.915,
- "acc_stderr": 0.008823426366942314,
- "acc_norm": 0.909,
- "acc_norm_stderr": 0.009099549538400241
- },
- "piqa": {
- "acc": 0.7611534276387377,
- "acc_stderr": 0.0099481203853375,
- "acc_norm": 0.7611534276387377,
- "acc_norm_stderr": 0.009948120385337484
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
-}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_4.csv b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_4.csv
new file mode 100644
index 0000000000000000000000000000000000000000..ef0bab965e1cb6dee884ef31574788d06fed3f2f
--- /dev/null
+++ b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_4.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.328,0.014853842487270334,0
+anli_r2,acc,0.355,0.015139491543780536,0
+anli_r3,acc,0.35083333333333333,0.013782212417178199,0
+arc_challenge,acc,0.2977815699658703,0.013363080107244489,0
+arc_challenge,acc_norm,0.3046075085324232,0.013449522109932487,0
+arc_easy,acc,0.6355218855218855,0.00987572928248244,0
+arc_easy,acc_norm,0.6111111111111112,0.01000324833531377,0
+boolq,acc,0.6159021406727829,0.008506861063860244,1
+cb,acc,0.5,0.06741998624632421,1
+cb,f1,0.26794871794871794,,1
+copa,acc,0.83,0.037752516806863715,0
+hellaswag,acc,0.4726150169288986,0.004982291744069915,0
+hellaswag,acc_norm,0.633240390360486,0.004809352075008949,0
+piqa,acc,0.750272034820457,0.010099232969867486,0
+piqa,acc_norm,0.7671381936887922,0.009861236071080751,0
+rte,acc,0.4729241877256318,0.030052303463143706,0
+sciq,acc,0.915,0.008823426366942324,0
+sciq,acc_norm,0.917,0.008728527206074787,0
+storycloze_2016,acc,0.7279529663281668,0.010290888060871242,0
+winogrande,acc,0.5911602209944752,0.013816954295135696,0
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_4_lm-eval_global_step80108_2023-02-22-18-53-17_4shots_backup.json b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_4_lm-eval_global_step80108_2023-02-22-18-53-17_4shots_backup.json
deleted file mode 100644
index 85e7ed8057e91a489479bda37b70a8bba463b6c6..0000000000000000000000000000000000000000
--- a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_4_lm-eval_global_step80108_2023-02-22-18-53-17_4shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
- "results": {
- "anli_r1": {
- "acc": 0.328,
- "acc_stderr": 0.014853842487270334
- },
- "anli_r2": {
- "acc": 0.355,
- "acc_stderr": 0.015139491543780536
- },
- "anli_r3": {
- "acc": 0.35083333333333333,
- "acc_stderr": 0.013782212417178199
- },
- "cb": {
- "acc": 0.5,
- "acc_stderr": 0.06741998624632421,
- "f1": 0.26794871794871794
- },
- "copa": {
- "acc": 0.83,
- "acc_stderr": 0.037752516806863715
- },
- "hellaswag": {
- "acc": 0.4726150169288986,
- "acc_stderr": 0.004982291744069915,
- "acc_norm": 0.633240390360486,
- "acc_norm_stderr": 0.004809352075008949
- },
- "rte": {
- "acc": 0.4729241877256318,
- "acc_stderr": 0.030052303463143706
- },
- "winogrande": {
- "acc": 0.5911602209944752,
- "acc_stderr": 0.013816954295135696
- },
- "storycloze_2016": {
- "acc": 0.7279529663281668,
- "acc_stderr": 0.010290888060871242
- },
- "boolq": {
- "acc": 0.6159021406727829,
- "acc_stderr": 0.008506861063860244
- },
- "arc_easy": {
- "acc": 0.6355218855218855,
- "acc_stderr": 0.00987572928248244,
- "acc_norm": 0.6111111111111112,
- "acc_norm_stderr": 0.01000324833531377
- },
- "arc_challenge": {
- "acc": 0.2977815699658703,
- "acc_stderr": 0.013363080107244489,
- "acc_norm": 0.3046075085324232,
- "acc_norm_stderr": 0.013449522109932487
- },
- "sciq": {
- "acc": 0.915,
- "acc_stderr": 0.008823426366942324,
- "acc_norm": 0.917,
- "acc_norm_stderr": 0.008728527206074787
- },
- "piqa": {
- "acc": 0.750272034820457,
- "acc_stderr": 0.010099232969867486,
- "acc_norm": 0.7671381936887922,
- "acc_norm_stderr": 0.009861236071080751
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
-}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_5.csv b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_5.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e1ae254f735e075571a433b6faef11c4a54a02ce
--- /dev/null
+++ b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_5.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.341,0.014998131348402695,0
+anli_r2,acc,0.348,0.015070604603768408,0
+anli_r3,acc,0.3408333333333333,0.01368860079329693,0
+arc_challenge,acc,0.3037542662116041,0.01343890918477875,0
+arc_challenge,acc_norm,0.3191126279863481,0.0136216961191733,0
+arc_easy,acc,0.6397306397306397,0.009851002584732383,0
+arc_easy,acc_norm,0.6157407407407407,0.009981120724601436,0
+boolq,acc,0.6116207951070336,0.008524357307908785,1
+cb,acc,0.5,0.06741998624632421,1
+cb,f1,0.2506410256410256,,1
+copa,acc,0.83,0.03775251680686371,0
+hellaswag,acc,0.4731129257120096,0.004982561815214125,0
+hellaswag,acc_norm,0.6352320254929297,0.004803812631994968,0
+piqa,acc,0.749183895538629,0.010113869547069044,0
+piqa,acc_norm,0.7600652883569097,0.009963625892809545,0
+rte,acc,0.49458483754512633,0.030094698123239966,0
+sciq,acc,0.921,0.008534156773333438,0
+sciq,acc_norm,0.916,0.008776162089491132,0
+storycloze_2016,acc,0.729021913415286,0.010278188399635048,0
+winogrande,acc,0.595895816890292,0.01379161066467086,0
diff --git a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_5_lm-eval_global_step80108_2023-02-22-18-53-17_5shots_backup.json b/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_5_lm-eval_global_step80108_2023-02-22-18-53-17_5shots_backup.json
deleted file mode 100644
index e80643c497e39ed0e9882780d370b285d82b3e1d..0000000000000000000000000000000000000000
--- a/4b284b84bc4v2seed3/evaluation/rankeval/4b284b84bc4v2seed3_5_lm-eval_global_step80108_2023-02-22-18-53-17_5shots_backup.json
+++ /dev/null
@@ -1,87 +0,0 @@
-{
- "results": {
- "anli_r1": {
- "acc": 0.341,
- "acc_stderr": 0.014998131348402695
- },
- "anli_r2": {
- "acc": 0.348,
- "acc_stderr": 0.015070604603768408
- },
- "anli_r3": {
- "acc": 0.3408333333333333,
- "acc_stderr": 0.01368860079329693
- },
- "cb": {
- "acc": 0.5,
- "acc_stderr": 0.06741998624632421,
- "f1": 0.2506410256410256
- },
- "copa": {
- "acc": 0.83,
- "acc_stderr": 0.03775251680686371
- },
- "hellaswag": {
- "acc": 0.4731129257120096,
- "acc_stderr": 0.004982561815214125,
- "acc_norm": 0.6352320254929297,
- "acc_norm_stderr": 0.004803812631994968
- },
- "rte": {
- "acc": 0.49458483754512633,
- "acc_stderr": 0.030094698123239966
- },
- "winogrande": {
- "acc": 0.595895816890292,
- "acc_stderr": 0.01379161066467086
- },
- "storycloze_2016": {
- "acc": 0.729021913415286,
- "acc_stderr": 0.010278188399635048
- },
- "boolq": {
- "acc": 0.6116207951070336,
- "acc_stderr": 0.008524357307908785
- },
- "arc_easy": {
- "acc": 0.6397306397306397,
- "acc_stderr": 0.009851002584732383,
- "acc_norm": 0.6157407407407407,
- "acc_norm_stderr": 0.009981120724601436
- },
- "arc_challenge": {
- "acc": 0.3037542662116041,
- "acc_stderr": 0.01343890918477875,
- "acc_norm": 0.3191126279863481,
- "acc_norm_stderr": 0.0136216961191733
- },
- "sciq": {
- "acc": 0.921,
- "acc_stderr": 0.008534156773333438,
- "acc_norm": 0.916,
- "acc_norm_stderr": 0.008776162089491132
- },
- "piqa": {
- "acc": 0.749183895538629,
- "acc_stderr": 0.010113869547069044,
- "acc_norm": 0.7600652883569097,
- "acc_norm_stderr": 0.009963625892809545
- }
- },
- "versions": {
- "anli_r1": 0,
- "anli_r2": 0,
- "anli_r3": 0,
- "cb": 1,
- "copa": 0,
- "hellaswag": 0,
- "rte": 0,
- "winogrande": 0,
- "storycloze_2016": 0,
- "boolq": 1,
- "arc_easy": 0,
- "arc_challenge": 0,
- "sciq": 0,
- "piqa": 0
- }
-}
\ No newline at end of file
diff --git a/4b284b84bc4v2seed4/evaluation/generation/merged.csv b/4b284b84bc4v2seed4/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..86a157c32bb109086e8cafc3e4afd2df2c9b92e1
--- /dev/null
+++ b/4b284b84bc4v2seed4/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0010842906651617643
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0010842906651617643
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.20198550157867662
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.20198550157867662
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22608889723213235
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22608889723213235
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23062310394414082
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23062310394414082
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23112566064185652
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23112566064185652
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.22596116871891767
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.22596116871891767
+e2e_nlg_cleaned,5,average,multiple,0.18614477046348096
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.048528921002958546
+gem_xsum,0,median,rouge2_fmeasure,0.048528921002958546
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03812763380493433
+gem_xsum,1,median,rouge2_fmeasure,0.03812763380493433
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.038746331277995295
+gem_xsum,2,median,rouge2_fmeasure,0.038746331277995295
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03502142308268661
+gem_xsum,3,median,rouge2_fmeasure,0.03502142308268661
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.009137318621937963
+gem_xsum,4,median,rouge2_fmeasure,0.009137318621937963
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00025593065551168283
+gem_xsum,5,median,rouge2_fmeasure,0.00025593065551168283
+gem_xsum,5,average,multiple,0.028302926407670737
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0506136487550108
+web_nlg_en,0,median,rouge2_fmeasure,0.0506136487550108
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.054731274015692305
+web_nlg_en,1,median,rouge2_fmeasure,0.054731274015692305
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05717657732704682
+web_nlg_en,2,median,rouge2_fmeasure,0.05717657732704682
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.058284289461857604
+web_nlg_en,3,median,rouge2_fmeasure,0.058284289461857604
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05693950723216792
+web_nlg_en,4,median,rouge2_fmeasure,0.05693950723216792
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05758469671440457
+web_nlg_en,5,median,rouge2_fmeasure,0.05758469671440457
+web_nlg_en,5,average,multiple,0.05588833225103
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03586670200965267
+wiki_lingua_en,0,median,rouge2_fmeasure,0.03586670200965267
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.051942458861991954
+wiki_lingua_en,1,median,rouge2_fmeasure,0.051942458861991954
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.051520411225961356
+wiki_lingua_en,2,median,rouge2_fmeasure,0.051520411225961356
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04463749924381735
+wiki_lingua_en,3,median,rouge2_fmeasure,0.04463749924381735
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014130696002074776
+wiki_lingua_en,4,median,rouge2_fmeasure,0.014130696002074776
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020440057618351945
+wiki_lingua_en,5,median,rouge2_fmeasure,0.0020440057618351945
+wiki_lingua_en,5,average,multiple,0.03335696218422222
diff --git a/4b284b84bc4v2seed4/evaluation/generation/merged.json b/4b284b84bc4v2seed4/evaluation/generation/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed648aaedc4ec3bc4c3c8194a0c6344192a6e9da
--- /dev/null
+++ b/4b284b84bc4v2seed4/evaluation/generation/merged.json
@@ -0,0 +1 @@
+{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.34441376096478077, "bleu_stderr": 0.03364738239161352, "rouge1_fmeasure": 0.10791921496577737, "rouge1_fmeasure_stderr": 0.002171402381627088, "rouge1_precision": 0.07156437671140904, "rouge1_precision_stderr": 0.0016916866568706518, "rouge1_recall": 0.29498663582371476, "rouge1_recall_stderr": 0.0047504849601230975, "rouge2_fmeasure": 0.0506136487550108, "rouge2_fmeasure_stderr": 0.0013544155568851314, "rouge2_precision": 0.03339629860790558, "rouge2_precision_stderr": 0.0010276756756951227, "rouge2_recall": 0.14089640762105182, "rouge2_recall_stderr": 0.003169219645311096, "rougeL_fmeasure": 0.10397564759824128, "rougeL_fmeasure_stderr": 0.002024130402320653, "rougeL_precision": 0.06867849616024986, "rougeL_precision_stderr": 0.0015588261952769657, "rougeL_recall": 0.2865361097530141, "rougeL_recall_stderr": 0.004609475551865619, "rougeLsum_fmeasure": 0.10284733675284609, "rougeLsum_fmeasure_stderr": 0.0020471999326618765, "rougeLsum_precision": 0.06816360012566612, "rougeLsum_precision_stderr": 0.001597766711115546, "rougeLsum_recall": 0.2811330280761629, "rougeLsum_recall_stderr": 0.004465952639791254}}, "1": {"PALM_prompt": {"bleu": 0.4687199126816133, "bleu_stderr": 0.036800374006492664, "rouge1_fmeasure": 0.11753553291473576, "rouge1_fmeasure_stderr": 0.0019140656759965315, "rouge1_precision": 0.07570221536151728, "rouge1_precision_stderr": 0.0014363865463457549, "rouge1_recall": 0.37281196342463774, "rouge1_recall_stderr": 0.005425026995807581, "rouge2_fmeasure": 0.054731274015692305, "rouge2_fmeasure_stderr": 0.0011992442792975787, "rouge2_precision": 0.03517849994399333, "rouge2_precision_stderr": 0.0008874020071430252, "rouge2_recall": 0.18207060381703669, "rouge2_recall_stderr": 0.003664740082406787, "rougeL_fmeasure": 0.11105152882900622, "rougeL_fmeasure_stderr": 0.0017516497637055164, "rougeL_precision": 0.07143954952407837, "rougeL_precision_stderr": 0.001305238902060498, "rougeL_recall": 0.35175875189308164, "rougeL_recall_stderr": 0.0049642525171124975, "rougeLsum_fmeasure": 0.11134944889691471, "rougeLsum_fmeasure_stderr": 0.001795094595999485, "rougeLsum_precision": 0.07178061720744913, "rougeLsum_precision_stderr": 0.0013540467985344866, "rougeLsum_recall": 0.35171330673154494, "rougeLsum_recall_stderr": 0.004937137016938597}}, "2": {"PALM_prompt": {"bleu": 0.5235717824828133, "bleu_stderr": 0.03767810577658163, "rouge1_fmeasure": 0.12260704886786168, "rouge1_fmeasure_stderr": 0.0018070929383927695, "rouge1_precision": 0.0783700606217331, "rouge1_precision_stderr": 0.0013622939044074152, "rouge1_recall": 0.39995054593345924, "rouge1_recall_stderr": 0.0052804057805626164, "rouge2_fmeasure": 0.05717657732704682, "rouge2_fmeasure_stderr": 0.001147470465886568, "rouge2_precision": 0.03627187434475958, "rouge2_precision_stderr": 0.0008236125331839457, "rouge2_recall": 0.19905166470669564, "rouge2_recall_stderr": 0.0038388111983134595, "rougeL_fmeasure": 0.11517678618925936, "rougeL_fmeasure_stderr": 0.0016478650517612742, "rougeL_precision": 0.07359908440403741, "rougeL_precision_stderr":
0.001237832140736892, "rougeL_recall": 0.37356398478304126, "rougeL_recall_stderr": 0.004770085055809505, "rougeLsum_fmeasure": 0.1165772048673089, "rougeLsum_fmeasure_stderr": 0.0016980264714204242, "rougeLsum_precision": 0.07453785183849788, "rougeLsum_precision_stderr": 0.0012849507538737566, "rougeLsum_recall": 0.3794084560485554, "rougeLsum_recall_stderr": 0.0049017847382029295}}, "3": {"PALM_prompt": {"bleu": 0.5916343641542043, "bleu_stderr": 0.04024225062945221, "rouge1_fmeasure": 0.12387093846990074, "rouge1_fmeasure_stderr": 0.001885726927501542, "rouge1_precision": 0.07888158387802857, "rouge1_precision_stderr": 0.0013905334633539245, "rouge1_recall": 0.4033319142330156, "rouge1_recall_stderr": 0.005299089005220485, "rouge2_fmeasure": 0.058284289461857604, "rouge2_fmeasure_stderr": 0.0012101468635650288, "rouge2_precision": 0.0369653886039744, "rouge2_precision_stderr": 0.0008664032303747072, "rouge2_recall": 0.20236135540846556, "rouge2_recall_stderr": 0.0038733010690355342, "rougeL_fmeasure": 0.11544325899200131, "rougeL_fmeasure_stderr": 0.0016990426527488723, "rougeL_precision": 0.073511276121497, "rougeL_precision_stderr": 0.001251002353845416, "rougeL_recall": 0.37511361165748164, "rougeL_recall_stderr": 0.004789107031378535, "rougeLsum_fmeasure": 0.11762103770674365, "rougeLsum_fmeasure_stderr": 0.0017726486791392565, "rougeLsum_precision": 0.07497143356596919, "rougeLsum_precision_stderr": 0.0013133945848135208, "rougeLsum_recall": 0.3819932847239564, "rougeLsum_recall_stderr": 0.004902385390105475}}, "4": {"PALM_prompt": {"bleu": 0.584148826341706, "bleu_stderr": 0.032088259094079065, "rouge1_fmeasure": 0.12197092014603778, "rouge1_fmeasure_stderr": 0.0017607850151334767, "rouge1_precision": 0.07737100502750222, "rouge1_precision_stderr": 0.0013014310515114744, "rouge1_recall": 0.4039418841652104, "rouge1_recall_stderr": 0.00530796073731901, "rouge2_fmeasure": 0.05693950723216792, "rouge2_fmeasure_stderr": 0.0011236415767176787, "rouge2_precision": 0.035917415755917156, "rouge2_precision_stderr": 0.0008009838171179654, "rouge2_recall": 0.20296997665907882, "rouge2_recall_stderr": 0.0038561197119444304, "rougeL_fmeasure": 0.11390315132939448, "rougeL_fmeasure_stderr": 0.0015842511141082533, "rougeL_precision": 0.07227090942349099, "rougeL_precision_stderr": 0.001170080155549711, "rougeL_recall": 0.37605665528150317, "rougeL_recall_stderr": 0.004782196417125406, "rougeLsum_fmeasure": 0.11605708607515382, "rougeLsum_fmeasure_stderr": 0.001657698456603933, "rougeLsum_precision": 0.07366718744107867, "rougeLsum_precision_stderr": 0.001228472532147223, "rougeLsum_recall": 0.3838102958885374, "rougeLsum_recall_stderr": 0.004940246508662335}}, "5": {"PALM_prompt": {"bleu": 0.643769416521065, "bleu_stderr": 0.04327961614658416, "rouge1_fmeasure": 0.12362474434882809, "rouge1_fmeasure_stderr": 0.0017387117939925156, "rouge1_precision": 0.0778714729225077, "rouge1_precision_stderr": 0.0012716812804740509, "rouge1_recall": 0.4199879645762265, "rouge1_recall_stderr": 0.00535881210434279, "rouge2_fmeasure": 0.05758469671440457, "rouge2_fmeasure_stderr": 0.0011051399672881632, "rouge2_precision": 0.036051220083070316, "rouge2_precision_stderr": 0.000777501535408198, "rouge2_recall": 0.21150748306290545, "rouge2_recall_stderr": 0.0039302718094465635, "rougeL_fmeasure": 0.11417482723461402, "rougeL_fmeasure_stderr": 0.0015386798608318928, "rougeL_precision": 0.0719478472042638, "rougeL_precision_stderr": 0.0011295516496439356, "rougeL_recall": 0.38653994641960504, 
"rougeL_recall_stderr": 0.004712427125192832, "rougeLsum_fmeasure": 0.1169653954794077, "rougeLsum_fmeasure_stderr": 0.0016283607844766808, "rougeLsum_precision": 0.0737393959127114, "rougeLsum_precision_stderr": 0.001196371537872998, "rougeLsum_recall": 0.39623787521806264, "rougeLsum_recall_stderr": 0.004912468268398718}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5029930497312527, "bleu_stderr": 0.050686861917854, "rouge1_fmeasure": 0.1799962602325878, "rouge1_fmeasure_stderr": 0.00184248350208152, "rouge1_precision": 0.1537922187888957, "rouge1_precision_stderr": 0.0018933213102104863, "rouge1_recall": 0.2625407961087503, "rouge1_recall_stderr": 0.002682894905994936, "rouge2_fmeasure": 0.03586670200965267, "rouge2_fmeasure_stderr": 0.0008450513639399489, "rouge2_precision": 0.030157104540943284, "rouge2_precision_stderr": 0.0007476727834062231, "rouge2_recall": 0.055001050533522555, "rouge2_recall_stderr": 0.0014448444524840297, "rougeL_fmeasure": 0.13967379634609953, "rougeL_fmeasure_stderr": 0.0013162078681435803, "rougeL_precision": 0.11780986849918428, "rougeL_precision_stderr": 0.0013162517308047393, "rougeL_recall": 0.20916963040579825, "rougeL_recall_stderr": 0.002196518290796577, "rougeLsum_fmeasure": 0.16507864218420232, "rougeLsum_fmeasure_stderr": 0.0016765101940476469, "rougeLsum_precision": 0.14081205310262065, "rougeLsum_precision_stderr": 0.0017218252583982084, "rougeLsum_recall": 0.2419875104700217, "rougeLsum_recall_stderr": 0.002500846878935716}}, "1": {"tldr_en": {"bleu": 2.7242641167777077, "bleu_stderr": 0.07938213037481555, "rouge1_fmeasure": 0.21295884672876816, "rouge1_fmeasure_stderr": 0.001991739006687224, "rouge1_precision": 0.18419539466594548, "rouge1_precision_stderr": 0.00216315844926582, "rouge1_recall": 0.30850131448570595, "rouge1_recall_stderr": 0.002881646502827461, "rouge2_fmeasure": 0.051942458861991954, "rouge2_fmeasure_stderr": 0.0010199515581882507, "rouge2_precision": 0.044713693466448406, "rouge2_precision_stderr": 0.000968138913860097, "rouge2_recall": 0.07839796746781187, "rouge2_recall_stderr": 0.0017417848523259565, "rougeL_fmeasure": 0.15484936057383927, "rougeL_fmeasure_stderr": 0.001347730246639345, "rougeL_precision": 0.1326981808478002, "rougeL_precision_stderr": 0.0014579507711624912, "rougeL_recall": 0.23011492408104228, "rougeL_recall_stderr": 0.00229754097749749, "rougeLsum_fmeasure": 0.1992303563413742, "rougeLsum_fmeasure_stderr": 0.0018619630110637574, "rougeLsum_precision": 0.17205251851388254, "rougeLsum_precision_stderr": 0.0020202953529773735, "rougeLsum_recall": 0.28974852616388547, "rougeLsum_recall_stderr": 0.002737913509276009}}, "2": {"tldr_en": {"bleu": 2.86942295772908, "bleu_stderr": 0.08758910211106738, "rouge1_fmeasure": 0.20703905721090818, "rouge1_fmeasure_stderr": 0.001929567362026057, "rouge1_precision": 0.1997095751146021, "rouge1_precision_stderr": 0.002703732640041402, "rouge1_recall": 0.2883485784255837, "rouge1_recall_stderr": 0.002841589718801568, "rouge2_fmeasure": 0.051520411225961356, "rouge2_fmeasure_stderr": 0.001031129051354628, "rouge2_precision": 0.052135221349108814, "rouge2_precision_stderr": 0.0014902691345396325, "rouge2_recall": 0.07402543775497225, "rouge2_recall_stderr": 0.001656124634067446, "rougeL_fmeasure": 0.15441401199805835, "rougeL_fmeasure_stderr": 0.0013920098120515118, "rougeL_precision": 0.1503544839184264, "rougeL_precision_stderr": 0.002191707211555614, "rougeL_recall": 0.21866995964876715, "rougeL_recall_stderr": 0.0022872119612051362, "rougeLsum_fmeasure": 
0.1949510570568441, "rougeLsum_fmeasure_stderr": 0.001804660306034504, "rougeLsum_precision": 0.18828746764086687, "rougeLsum_precision_stderr": 0.002574176036533358, "rougeLsum_recall": 0.27233360792883826, "rougeLsum_recall_stderr": 0.0027083132609021184}}, "3": {"tldr_en": {"bleu": 3.0330716712469843, "bleu_stderr": 0.09425863264675978, "rouge1_fmeasure": 0.17391868610816388, "rouge1_fmeasure_stderr": 0.002277407569748311, "rouge1_precision": 0.19702312634415606, "rouge1_precision_stderr": 0.0035795334816332027, "rouge1_recall": 0.22825422304548265, "rouge1_recall_stderr": 0.003232347523636064, "rouge2_fmeasure": 0.04463749924381735, "rouge2_fmeasure_stderr": 0.0010686000964725524, "rouge2_precision": 0.05667983844146617, "rouge2_precision_stderr": 0.002120538883847141, "rouge2_recall": 0.059228247136442105, "rouge2_recall_stderr": 0.0015413764801780459, "rougeL_fmeasure": 0.12950729491278481, "rougeL_fmeasure_stderr": 0.00167294588483517, "rougeL_precision": 0.15076982480282128, "rougeL_precision_stderr": 0.0030098254531581766, "rougeL_recall": 0.17209850822866807, "rougeL_recall_stderr": 0.0025209050965990243, "rougeLsum_fmeasure": 0.1642422006919473, "rougeLsum_fmeasure_stderr": 0.002147002697595944, "rougeLsum_precision": 0.1860026881384513, "rougeLsum_precision_stderr": 0.003405588840643614, "rougeLsum_recall": 0.21613806050396925, "rougeLsum_recall_stderr": 0.0030779584306118175}}, "4": {"tldr_en": {"bleu": 0.4039582112595248, "bleu_stderr": 0.03484552558450677, "rouge1_fmeasure": 0.05441509889844838, "rouge1_fmeasure_stderr": 0.001922155872957772, "rouge1_precision": 0.06862907083042237, "rouge1_precision_stderr": 0.0029137045746235116, "rouge1_recall": 0.07213602119470613, "rouge1_recall_stderr": 0.002675324630242608, "rouge2_fmeasure": 0.014130696002074776, "rouge2_fmeasure_stderr": 0.0007469700039793996, "rouge2_precision": 0.019800771477798358, "rouge2_precision_stderr": 0.0014215056199837948, "rouge2_recall": 0.019407609285097115, "rouge2_recall_stderr": 0.0011737580125973422, "rougeL_fmeasure": 0.04132917986525425, "rougeL_fmeasure_stderr": 0.001453754043869357, "rougeL_precision": 0.054030861033230786, "rougeL_precision_stderr": 0.002428473990666629, "rougeL_recall": 0.055369844370012286, "rougeL_recall_stderr": 0.0020897273452589395, "rougeLsum_fmeasure": 0.051839091609912694, "rougeLsum_fmeasure_stderr": 0.00182789753930337, "rougeLsum_precision": 0.06581471030428865, "rougeLsum_precision_stderr": 0.0028148588567455577, "rougeLsum_recall": 0.06855419478068313, "rougeLsum_recall_stderr": 0.002540543323104449}}, "5": {"tldr_en": {"bleu": 2.0014137810446263e-08, "bleu_stderr": 5.022609361040328e-08, "rouge1_fmeasure": 0.008187248278912505, "rouge1_fmeasure_stderr": 0.0008215139146136158, "rouge1_precision": 0.010184074003326236, "rouge1_precision_stderr": 0.001185280919251267, "rouge1_recall": 0.010530222683810939, "rouge1_recall_stderr": 0.0010849872451804167, "rouge2_fmeasure": 0.0020440057618351945, "rouge2_fmeasure_stderr": 0.0002894852867481284, "rouge2_precision": 0.003045190089865129, "rouge2_precision_stderr": 0.000629256262446744, "rouge2_recall": 0.0025316582919952747, "rouge2_recall_stderr": 0.00035954782149502667, "rougeL_fmeasure": 0.006141492653296614, "rougeL_fmeasure_stderr": 0.000619473069682991, "rougeL_precision": 0.00801661098513702, "rougeL_precision_stderr": 0.0009926654487693505, "rougeL_recall": 0.007789272405875561, "rougeL_recall_stderr": 0.0008037973390293996, "rougeLsum_fmeasure": 0.007792108172262034, "rougeLsum_fmeasure_stderr": 
0.0007861479859061975, "rougeLsum_precision": 0.009784563527498156, "rougeLsum_precision_stderr": 0.0011539456656877766, "rougeLsum_recall": 0.009966706597956685, "rougeLsum_recall_stderr": 0.001029454113884746}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.1668198320527802, "bleu_stderr": 0.01897983257333898, "rouge1_fmeasure": 0.022506347386715704, "rouge1_fmeasure_stderr": 0.0005868062207608241, "rouge1_precision": 0.030286876271392194, "rouge1_precision_stderr": 0.001036156882791465, "rouge1_recall": 0.027143500611213456, "rouge1_recall_stderr": 0.0007465539304065285, "rouge2_fmeasure": 0.0010842906651617643, "rouge2_fmeasure_stderr": 0.0001636612072099844, "rouge2_precision": 0.0011223500556642703, "rouge2_precision_stderr": 0.0002013011694364979, "rouge2_recall": 0.0015238484363635906, "rouge2_recall_stderr": 0.00022622383493949605, "rougeL_fmeasure": 0.022348380076958108, "rougeL_fmeasure_stderr": 0.0005776004107689916, "rougeL_precision": 0.029891451434790904, "rougeL_precision_stderr": 0.0010048119314992631, "rougeL_recall": 0.027040559559811043, "rougeL_recall_stderr": 0.0007436210949765598, "rougeLsum_fmeasure": 0.02171066987180087, "rougeLsum_fmeasure_stderr": 0.0005513649905399339, "rougeLsum_precision": 0.02960870450534663, "rougeLsum_precision_stderr": 0.0010175867142908736, "rougeLsum_recall": 0.025998503793795025, "rougeLsum_recall_stderr": 0.0006867190478188373}}, "1": {"generate_text_restaurant": {"bleu": 11.497789088857585, "bleu_stderr": 0.1207333221383202, "rouge1_fmeasure": 0.437040451945744, "rouge1_fmeasure_stderr": 0.002333774145212166, "rouge1_precision": 0.5193372726053079, "rouge1_precision_stderr": 0.0032827860366154518, "rouge1_recall": 0.4187417243003722, "rouge1_recall_stderr": 0.0029601462557396806, "rouge2_fmeasure": 0.20198550157867662, "rouge2_fmeasure_stderr": 0.0019395848816704526, "rouge2_precision": 0.24404637097851045, "rouge2_precision_stderr": 0.0025862116613527076, "rouge2_recall": 0.1933021952228981, "rouge2_recall_stderr": 0.002111531045223336, "rougeL_fmeasure": 0.31702593617702457, "rougeL_fmeasure_stderr": 0.0020123220115816466, "rougeL_precision": 0.3793149671052253, "rougeL_precision_stderr": 0.0028788724723207175, "rougeL_recall": 0.303084323993645, "rougeL_recall_stderr": 0.0023997981815417927, "rougeLsum_fmeasure": 0.3567272344346471, "rougeLsum_fmeasure_stderr": 0.002273204999670106, "rougeLsum_precision": 0.424766622860286, "rougeLsum_precision_stderr": 0.0031065633020390844, "rougeLsum_recall": 0.34158689328494163, "rougeLsum_recall_stderr": 0.0027234093565780253}}, "2": {"generate_text_restaurant": {"bleu": 13.232352061213327, "bleu_stderr": 0.18438572640282935, "rouge1_fmeasure": 0.46686827804183195, "rouge1_fmeasure_stderr": 0.0021857893913537927, "rouge1_precision": 0.5438295687735817, "rouge1_precision_stderr": 0.0032626249125624544, "rouge1_recall": 0.4519156735724399, "rouge1_recall_stderr": 0.0028471232903033885, "rouge2_fmeasure": 0.22608889723213235, "rouge2_fmeasure_stderr": 0.001981518091728108, "rouge2_precision": 0.2677592832314449, "rouge2_precision_stderr": 0.002678442807946661, "rouge2_recall": 0.21857181845743923, "rouge2_recall_stderr": 0.0021530696594196743, "rougeL_fmeasure": 0.34083378350431404, "rougeL_fmeasure_stderr": 0.0020309168195614307, "rougeL_precision": 0.3988680585181114, "rougeL_precision_stderr": 0.0029405304248634437, "rougeL_recall": 0.3293160518559301, "rougeL_recall_stderr": 0.002411675944122487, "rougeLsum_fmeasure": 0.38531835059376945, "rougeLsum_fmeasure_stderr": 
0.0022236074600925494, "rougeLsum_precision": 0.44900069330118875, "rougeLsum_precision_stderr": 0.0031269512646810997, "rougeLsum_recall": 0.3728742942560764, "rougeLsum_recall_stderr": 0.002675120241181924}}, "3": {"generate_text_restaurant": {"bleu": 14.1599321841023, "bleu_stderr": 0.1879116618302546, "rouge1_fmeasure": 0.4695262322349431, "rouge1_fmeasure_stderr": 0.00215490562453037, "rouge1_precision": 0.5316343436240858, "rouge1_precision_stderr": 0.0032625643740183926, "rouge1_recall": 0.4648983433320942, "rouge1_recall_stderr": 0.002828266123213244, "rouge2_fmeasure": 0.23062310394414082, "rouge2_fmeasure_stderr": 0.0019690665088102194, "rouge2_precision": 0.26457242284963084, "rouge2_precision_stderr": 0.0026154624845308966, "rouge2_recall": 0.2286713763984792, "rouge2_recall_stderr": 0.002213558974457314, "rougeL_fmeasure": 0.3421096115443624, "rougeL_fmeasure_stderr": 0.0020310235455138293, "rougeL_precision": 0.3891085104737073, "rougeL_precision_stderr": 0.0029515533104354013, "rougeL_recall": 0.3385179926519356, "rougeL_recall_stderr": 0.002447035400467983, "rougeLsum_fmeasure": 0.3897046712016254, "rougeLsum_fmeasure_stderr": 0.0022104074028312126, "rougeLsum_precision": 0.44153945634327574, "rougeLsum_precision_stderr": 0.003131733329038247, "rougeLsum_recall": 0.3859861563616844, "rougeLsum_recall_stderr": 0.0027050660850819557}}, "4": {"generate_text_restaurant": {"bleu": 14.215875217167579, "bleu_stderr": 0.19854834212398645, "rouge1_fmeasure": 0.46760558865318314, "rouge1_fmeasure_stderr": 0.0021274829390602406, "rouge1_precision": 0.5190910100984744, "rouge1_precision_stderr": 0.003294663292154946, "rouge1_recall": 0.47129632991643877, "rouge1_recall_stderr": 0.002752331164331658, "rouge2_fmeasure": 0.23112566064185652, "rouge2_fmeasure_stderr": 0.001973386702544114, "rouge2_precision": 0.2607371623810947, "rouge2_precision_stderr": 0.0026626721914091273, "rouge2_recall": 0.23271303570323362, "rouge2_recall_stderr": 0.002186914450625737, "rougeL_fmeasure": 0.33922485306684963, "rougeL_fmeasure_stderr": 0.0020379531080847698, "rougeL_precision": 0.37743317581253305, "rougeL_precision_stderr": 0.002922501161012456, "rougeL_recall": 0.3421754841859648, "rougeL_recall_stderr": 0.002448897823883138, "rougeLsum_fmeasure": 0.39088870724203995, "rougeLsum_fmeasure_stderr": 0.0022061435555151025, "rougeLsum_precision": 0.43332497676504134, "rougeLsum_precision_stderr": 0.0031201843719093183, "rougeLsum_recall": 0.39450016226821116, "rougeLsum_recall_stderr": 0.002683555554853958}}, "5": {"generate_text_restaurant": {"bleu": 13.675372357471726, "bleu_stderr": 0.1772679235890945, "rouge1_fmeasure": 0.4652741436608286, "rouge1_fmeasure_stderr": 0.0020856290234426084, "rouge1_precision": 0.5079146856634101, "rouge1_precision_stderr": 0.003244631875951903, "rouge1_recall": 0.4750602535272041, "rouge1_recall_stderr": 0.002707456752563758, "rouge2_fmeasure": 0.22596116871891767, "rouge2_fmeasure_stderr": 0.0019272220902171223, "rouge2_precision": 0.2506185278912072, "rouge2_precision_stderr": 0.0025688729450509744, "rouge2_recall": 0.23020274907523836, "rouge2_recall_stderr": 0.0021319283224229416, "rougeL_fmeasure": 0.3336418614264682, "rougeL_fmeasure_stderr": 0.0020082018536239574, "rougeL_precision": 0.3646830222320307, "rougeL_precision_stderr": 0.002836137970051408, "rougeL_recall": 0.34124704138322626, "rougeL_recall_stderr": 0.002433024945476074, "rougeLsum_fmeasure": 0.38639018088252236, "rougeLsum_fmeasure_stderr": 0.0021588490021418913, "rougeLsum_precision": 
0.42108375995240394, "rougeLsum_precision_stderr": 0.003034856921594324, "rougeLsum_recall": 0.3952453063898795, "rougeLsum_recall_stderr": 0.0026571910552197976}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.9965727451782298, "bleu_stderr": 0.15826205801047227, "rouge1_fmeasure": 0.2181226869116415, "rouge1_fmeasure_stderr": 0.0027581474749694035, "rouge1_precision": 0.17780010896090837, "rouge1_precision_stderr": 0.0028657687702784394, "rouge1_recall": 0.33168127958194993, "rouge1_recall_stderr": 0.0043909919411222464, "rouge2_fmeasure": 0.048528921002958546, "rouge2_fmeasure_stderr": 0.0017764487817816764, "rouge2_precision": 0.03931279207892139, "rouge2_precision_stderr": 0.0016581578355807393, "rouge2_recall": 0.07619599508022841, "rouge2_recall_stderr": 0.00272241500352495, "rougeL_fmeasure": 0.16205491808395142, "rougeL_fmeasure_stderr": 0.002184657948901955, "rougeL_precision": 0.13228242420416667, "rougeL_precision_stderr": 0.0023035178337467654, "rougeL_recall": 0.24740150573619668, "rougeL_recall_stderr": 0.003505699991405706, "rougeLsum_fmeasure": 0.16995115866612748, "rougeLsum_fmeasure_stderr": 0.0023209836007670715, "rougeLsum_precision": 0.13792549177526942, "rougeLsum_precision_stderr": 0.002344912834751097, "rougeLsum_recall": 0.26116272332322926, "rougeLsum_recall_stderr": 0.0038687142611478474}}, "1": {"article_DOC_summary": {"bleu": 1.521961099148944, "bleu_stderr": 0.1067162997839333, "rouge1_fmeasure": 0.17785826511919287, "rouge1_fmeasure_stderr": 0.002506168575309281, "rouge1_precision": 0.12623297679712753, "rouge1_precision_stderr": 0.00185031477682142, "rouge1_recall": 0.31355314687479174, "rouge1_recall_stderr": 0.004381874835308543, "rouge2_fmeasure": 0.03812763380493433, "rouge2_fmeasure_stderr": 0.0014499362846337098, "rouge2_precision": 0.02671739102876372, "rouge2_precision_stderr": 0.0010147304084239857, "rouge2_recall": 0.06963018507899452, "rouge2_recall_stderr": 0.002768236684178756, "rougeL_fmeasure": 0.13961978647843165, "rougeL_fmeasure_stderr": 0.0018876284466631298, "rougeL_precision": 0.0989142119051423, "rougeL_precision_stderr": 0.0013791496324926688, "rougeL_recall": 0.24757016935280626, "rougeL_recall_stderr": 0.0034520987132384117, "rougeLsum_fmeasure": 0.1418358500050043, "rougeLsum_fmeasure_stderr": 0.0020742150121420516, "rougeLsum_precision": 0.10042944590263467, "rougeLsum_precision_stderr": 0.0015100489954681905, "rougeLsum_recall": 0.2518445443289288, "rougeLsum_recall_stderr": 0.003780328500383181}}, "2": {"article_DOC_summary": {"bleu": 1.5280857149419538, "bleu_stderr": 0.07885108475340596, "rouge1_fmeasure": 0.17992305299553213, "rouge1_fmeasure_stderr": 0.002471474032703446, "rouge1_precision": 0.12776888363608563, "rouge1_precision_stderr": 0.0018271531287357235, "rouge1_recall": 0.31621003695806416, "rouge1_recall_stderr": 0.004292592593741099, "rouge2_fmeasure": 0.038746331277995295, "rouge2_fmeasure_stderr": 0.001439750042341812, "rouge2_precision": 0.02713599956564803, "rouge2_precision_stderr": 0.0010066515368428789, "rouge2_recall": 0.07048237066918879, "rouge2_recall_stderr": 0.0027207468576705084, "rougeL_fmeasure": 0.14290468633219328, "rougeL_fmeasure_stderr": 0.0019087949950632504, "rougeL_precision": 0.10123566070632635, "rougeL_precision_stderr": 0.0013895764877661831, "rougeL_recall": 0.2528390510389675, "rougeL_recall_stderr": 0.003483747346972295, "rougeLsum_fmeasure": 0.14276376154853065, "rougeLsum_fmeasure_stderr": 0.002040901548342001, "rougeLsum_precision": 0.10104760152172992, 
"rougeLsum_precision_stderr": 0.0014792769919128913, "rougeLsum_recall": 0.2530463368650048, "rougeLsum_recall_stderr": 0.0037161332717751}}, "3": {"article_DOC_summary": {"bleu": 1.521537278708307, "bleu_stderr": 0.1340390241992026, "rouge1_fmeasure": 0.16685520709327625, "rouge1_fmeasure_stderr": 0.002665570155443094, "rouge1_precision": 0.12121822408668065, "rouge1_precision_stderr": 0.0020739133661264075, "rouge1_recall": 0.287891057927081, "rouge1_recall_stderr": 0.004579134322254905, "rouge2_fmeasure": 0.03502142308268661, "rouge2_fmeasure_stderr": 0.001453833565879767, "rouge2_precision": 0.024946562796427662, "rouge2_precision_stderr": 0.0010295220175533656, "rouge2_recall": 0.062229435333035196, "rouge2_recall_stderr": 0.0027102677143777706, "rougeL_fmeasure": 0.13219464959176125, "rougeL_fmeasure_stderr": 0.002089625494358884, "rougeL_precision": 0.09584883568666924, "rougeL_precision_stderr": 0.0016019599062301714, "rougeL_recall": 0.22935087983395927, "rougeL_recall_stderr": 0.003723896150091747, "rougeLsum_fmeasure": 0.13360679199458778, "rougeLsum_fmeasure_stderr": 0.0022032590659256113, "rougeLsum_precision": 0.09677327103730465, "rougeLsum_precision_stderr": 0.0016860407775182488, "rougeLsum_recall": 0.23249987999606497, "rougeLsum_recall_stderr": 0.003931934403157173}}, "4": {"article_DOC_summary": {"bleu": 0.629879023989858, "bleu_stderr": 0.12544962320083983, "rouge1_fmeasure": 0.04534730746066439, "rouge1_fmeasure_stderr": 0.002611229049263828, "rouge1_precision": 0.038078096723767996, "rouge1_precision_stderr": 0.0023781047408131164, "rouge1_recall": 0.07110840872162687, "rouge1_recall_stderr": 0.0041655460986093325, "rouge2_fmeasure": 0.009137318621937963, "rouge2_fmeasure_stderr": 0.0009714552620996167, "rouge2_precision": 0.00755130939213845, "rouge2_precision_stderr": 0.0009407309816817791, "rouge2_recall": 0.01456763376419491, "rouge2_recall_stderr": 0.0015059193465168199, "rougeL_fmeasure": 0.036178777224856525, "rougeL_fmeasure_stderr": 0.0020901567682781942, "rougeL_precision": 0.030895255217096195, "rougeL_precision_stderr": 0.002026498228282476, "rougeL_recall": 0.05673417944538178, "rougeL_recall_stderr": 0.0033250538537188097, "rougeLsum_fmeasure": 0.03747044675977351, "rougeLsum_fmeasure_stderr": 0.002195978615372179, "rougeLsum_precision": 0.03193887181863606, "rougeLsum_precision_stderr": 0.002103908155038245, "rougeLsum_recall": 0.05872079534060702, "rougeLsum_recall_stderr": 0.0034877968907706214}}, "5": {"article_DOC_summary": {"bleu": 8.029951400668467e-38, "bleu_stderr": 8.999389473215882e-33, "rouge1_fmeasure": 0.0024480546534426002, "rouge1_fmeasure_stderr": 0.0006663143862102796, "rouge1_precision": 0.0028153785061314713, "rouge1_precision_stderr": 0.0008109014775672004, "rouge1_recall": 0.0022397022323243287, "rouge1_recall_stderr": 0.0005923742999126791, "rouge2_fmeasure": 0.00025593065551168283, "rouge2_fmeasure_stderr": 0.00014085048380669117, "rouge2_precision": 0.00032960010762452495, "rouge2_precision_stderr": 0.0001942158416689748, "rouge2_recall": 0.00021523288032721994, "rouge2_recall_stderr": 0.00011457292554383501, "rougeL_fmeasure": 0.0018728301034077845, "rougeL_fmeasure_stderr": 0.0005184193716142723, "rougeL_precision": 0.0021686356526300445, "rougeL_precision_stderr": 0.0006433045803199352, "rougeL_recall": 0.00171133818409571, "rougeL_recall_stderr": 0.0004578910681243736, "rougeLsum_fmeasure": 0.0021046719353290323, "rougeLsum_fmeasure_stderr": 0.0005793875805849997, "rougeLsum_precision": 0.0024191989997527706, 
"rougeLsum_precision_stderr": 0.0007056629627390068, "rougeLsum_recall": 0.0019291072387565224, "rougeLsum_recall_stderr": 0.0005158196897264791}}}} \ No newline at end of file diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_0.csv b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..e0b58f54bc6442918fd81508d0a892f31ad9bd5b --- /dev/null +++ b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.323,0.01479492784334864,0 +anli_r2,acc,0.35,0.015090650341444233,0 +anli_r3,acc,0.3441666666666667,0.013720551062295755,0 +arc_challenge,acc,0.2790102389078498,0.013106784883601336,0 +arc_challenge,acc_norm,0.3046075085324232,0.01344952210993249,0 +arc_easy,acc,0.5989057239057239,0.01005705110653437,0 +arc_easy,acc_norm,0.5425084175084175,0.010222638127749496,0 +boolq,acc,0.5886850152905199,0.008606395426309208,1 +cb,acc,0.3392857142857143,0.06384226561930825,1 +cb,f1,0.26343091936312274,,1 +copa,acc,0.72,0.04512608598542127,0 +hellaswag,acc,0.475502887870942,0.004983788992681198,0 +hellaswag,acc_norm,0.6266679944234216,0.004827006520802888,0 +piqa,acc,0.7551686615886833,0.010032309105568793,0 +piqa,acc_norm,0.763873775843308,0.009908965890558216,0 +rte,acc,0.5631768953068592,0.02985524739031495,0 +sciq,acc,0.849,0.011328165223341671,0 +sciq,acc_norm,0.757,0.013569640199177451,0 +storycloze_2016,acc,0.7156600748262961,0.01043161412866525,0 +winogrande,acc,0.5761641673243884,0.013888492389944508,0 diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_0_lm-eval_global_step80108_2023-02-24-15-37-25_0shots_backup.json b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_0_lm-eval_global_step80108_2023-02-24-15-37-25_0shots_backup.json deleted file mode 100644 index b32521d8f01d8f7c607d2551e27e155422831017..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_0_lm-eval_global_step80108_2023-02-24-15-37-25_0shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.323, - "acc_stderr": 0.01479492784334864 - }, - "anli_r2": { - "acc": 0.35, - "acc_stderr": 0.015090650341444233 - }, - "anli_r3": { - "acc": 0.3441666666666667, - "acc_stderr": 0.013720551062295755 - }, - "cb": { - "acc": 0.3392857142857143, - "acc_stderr": 0.06384226561930825, - "f1": 0.26343091936312274 - }, - "copa": { - "acc": 0.72, - "acc_stderr": 0.04512608598542127 - }, - "hellaswag": { - "acc": 0.475502887870942, - "acc_stderr": 0.004983788992681198, - "acc_norm": 0.6266679944234216, - "acc_norm_stderr": 0.004827006520802888 - }, - "rte": { - "acc": 0.5631768953068592, - "acc_stderr": 0.02985524739031495 - }, - "winogrande": { - "acc": 0.5761641673243884, - "acc_stderr": 0.013888492389944508 - }, - "storycloze_2016": { - "acc": 0.7156600748262961, - "acc_stderr": 0.01043161412866525 - }, - "boolq": { - "acc": 0.5886850152905199, - "acc_stderr": 0.008606395426309208 - }, - "arc_easy": { - "acc": 0.5989057239057239, - "acc_stderr": 0.01005705110653437, - "acc_norm": 0.5425084175084175, - "acc_norm_stderr": 0.010222638127749496 - }, - "arc_challenge": { - "acc": 0.2790102389078498, - "acc_stderr": 0.013106784883601336, - "acc_norm": 0.3046075085324232, - "acc_norm_stderr": 0.01344952210993249 - }, - "sciq": { - "acc": 0.849, - "acc_stderr": 0.011328165223341671, - "acc_norm": 0.757, - "acc_norm_stderr": 0.013569640199177451 - }, - "piqa": { - "acc": 
0.7551686615886833, - "acc_stderr": 0.010032309105568793, - "acc_norm": 0.763873775843308, - "acc_norm_stderr": 0.009908965890558216 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_1.csv b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..dc923bd0dda1ec859acae2ea78b467fdb3c51799 --- /dev/null +++ b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.314,0.014683991951087966,0 +anli_r2,acc,0.319,0.014746404865473477,0 +anli_r3,acc,0.3475,0.013751753243291852,0 +arc_challenge,acc,0.29436860068259385,0.013318528460539426,0 +arc_challenge,acc_norm,0.32337883959044367,0.013669421630012132,0 +arc_easy,acc,0.617003367003367,0.009974920384536469,0 +arc_easy,acc_norm,0.5744949494949495,0.010145271182591026,0 +boolq,acc,0.5957186544342508,0.008583313811372076,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.3282195387458545,,1 +copa,acc,0.76,0.04292346959909283,0 +hellaswag,acc,0.47321250746863175,0.004982615233057104,0 +hellaswag,acc_norm,0.6222863971320454,0.004838246410786253,0 +piqa,acc,0.7540805223068553,0.010047331865625191,0 +piqa,acc_norm,0.7584330794341676,0.009986718001804454,0 +rte,acc,0.5487364620938628,0.029953149241808946,0 +sciq,acc,0.884,0.010131468138756995,0 +sciq,acc_norm,0.859,0.011010914595992443,0 +storycloze_2016,acc,0.7049706039551042,0.010546232606962287,0 +winogrande,acc,0.5706393054459353,0.013911537499969179,0 diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_1_lm-eval_global_step80108_2023-02-24-15-37-25_1shots_backup.json b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_1_lm-eval_global_step80108_2023-02-24-15-37-25_1shots_backup.json deleted file mode 100644 index add8df2245b74afd79809099e62b7286cf26af96..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_1_lm-eval_global_step80108_2023-02-24-15-37-25_1shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.314, - "acc_stderr": 0.014683991951087966 - }, - "anli_r2": { - "acc": 0.319, - "acc_stderr": 0.014746404865473477 - }, - "anli_r3": { - "acc": 0.3475, - "acc_stderr": 0.013751753243291852 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.3282195387458545 - }, - "copa": { - "acc": 0.76, - "acc_stderr": 0.04292346959909283 - }, - "hellaswag": { - "acc": 0.47321250746863175, - "acc_stderr": 0.004982615233057104, - "acc_norm": 0.6222863971320454, - "acc_norm_stderr": 0.004838246410786253 - }, - "rte": { - "acc": 0.5487364620938628, - "acc_stderr": 0.029953149241808946 - }, - "winogrande": { - "acc": 0.5706393054459353, - "acc_stderr": 0.013911537499969179 - }, - "storycloze_2016": { - "acc": 0.7049706039551042, - "acc_stderr": 0.010546232606962287 - }, - "boolq": { - "acc": 0.5957186544342508, - "acc_stderr": 0.008583313811372076 - }, - "arc_easy": { - "acc": 0.617003367003367, - "acc_stderr": 0.009974920384536469, - "acc_norm": 0.5744949494949495, - "acc_norm_stderr": 0.010145271182591026 - }, - "arc_challenge": { - "acc": 0.29436860068259385, - "acc_stderr": 0.013318528460539426, - 
"acc_norm": 0.32337883959044367, - "acc_norm_stderr": 0.013669421630012132 - }, - "sciq": { - "acc": 0.884, - "acc_stderr": 0.010131468138756995, - "acc_norm": 0.859, - "acc_norm_stderr": 0.011010914595992443 - }, - "piqa": { - "acc": 0.7540805223068553, - "acc_stderr": 0.010047331865625191, - "acc_norm": 0.7584330794341676, - "acc_norm_stderr": 0.009986718001804454 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_2.csv b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..2904c3ae834cf1ef0c1626c693e2cdb5d17ed09d --- /dev/null +++ b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.326,0.014830507204541056,0 +anli_r2,acc,0.321,0.014770821817934644,0 +anli_r3,acc,0.3308333333333333,0.013588208070708997,0 +arc_challenge,acc,0.302901023890785,0.013428241573185349,0 +arc_challenge,acc_norm,0.3174061433447099,0.01360223908803817,0 +arc_easy,acc,0.6191077441077442,0.009964428212260372,0 +arc_easy,acc_norm,0.5921717171717171,0.010083950240041223,0 +boolq,acc,0.6039755351681957,0.008553881336813413,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2692307692307692,,1 +copa,acc,0.75,0.04351941398892446,0 +hellaswag,acc,0.4715196176060546,0.004981680090303695,0 +hellaswag,acc_norm,0.6208922525393348,0.004841734453506668,0 +piqa,acc,0.7524483133841132,0.010069703966857104,0 +piqa,acc_norm,0.7633297062023939,0.009916841655042809,0 +rte,acc,0.48375451263537905,0.030080573208738064,0 +sciq,acc,0.899,0.009533618929340997,0 +sciq,acc_norm,0.877,0.010391293421849877,0 +storycloze_2016,acc,0.7183324425440941,0.010401844358587665,0 +winogrande,acc,0.574585635359116,0.013895257666646382,0 diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_2_lm-eval_global_step80108_2023-02-24-15-37-26_2shots_backup.json b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_2_lm-eval_global_step80108_2023-02-24-15-37-26_2shots_backup.json deleted file mode 100644 index 8e961a9c331db0036f933002c7fc889aaa180731..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_2_lm-eval_global_step80108_2023-02-24-15-37-26_2shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.326, - "acc_stderr": 0.014830507204541056 - }, - "anli_r2": { - "acc": 0.321, - "acc_stderr": 0.014770821817934644 - }, - "anli_r3": { - "acc": 0.3308333333333333, - "acc_stderr": 0.013588208070708997 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.2692307692307692 - }, - "copa": { - "acc": 0.75, - "acc_stderr": 0.04351941398892446 - }, - "hellaswag": { - "acc": 0.4715196176060546, - "acc_stderr": 0.004981680090303695, - "acc_norm": 0.6208922525393348, - "acc_norm_stderr": 0.004841734453506668 - }, - "rte": { - "acc": 0.48375451263537905, - "acc_stderr": 0.030080573208738064 - }, - "winogrande": { - "acc": 0.574585635359116, - "acc_stderr": 0.013895257666646382 - }, - "storycloze_2016": { - "acc": 0.7183324425440941, - "acc_stderr": 0.010401844358587665 - }, - "boolq": { - "acc": 0.6039755351681957, - "acc_stderr": 0.008553881336813413 - }, - "arc_easy": { - "acc": 
0.6191077441077442, - "acc_stderr": 0.009964428212260372, - "acc_norm": 0.5921717171717171, - "acc_norm_stderr": 0.010083950240041223 - }, - "arc_challenge": { - "acc": 0.302901023890785, - "acc_stderr": 0.013428241573185349, - "acc_norm": 0.3174061433447099, - "acc_norm_stderr": 0.01360223908803817 - }, - "sciq": { - "acc": 0.899, - "acc_stderr": 0.009533618929340997, - "acc_norm": 0.877, - "acc_norm_stderr": 0.010391293421849877 - }, - "piqa": { - "acc": 0.7524483133841132, - "acc_stderr": 0.010069703966857104, - "acc_norm": 0.7633297062023939, - "acc_norm_stderr": 0.009916841655042809 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_3.csv b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..a02f2d6a33b92bfcdc9d53d1e63fab115bbb2a74 --- /dev/null +++ b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.306,0.014580006055436967,0 +anli_r2,acc,0.361,0.015195720118175122,0 +anli_r3,acc,0.31583333333333335,0.01342456883035645,0 +arc_challenge,acc,0.30119453924914674,0.01340674176784762,0 +arc_challenge,acc_norm,0.3250853242320819,0.013688147309729122,0 +arc_easy,acc,0.622895622895623,0.009945041946366515,0 +arc_easy,acc_norm,0.6035353535353535,0.01003741276306453,0 +boolq,acc,0.6042813455657492,0.008552742471459792,1 +cb,acc,0.35714285714285715,0.06460957383809221,1 +cb,f1,0.2902724515627741,,1 +copa,acc,0.81,0.03942772444036623,0 +hellaswag,acc,0.4742083250348536,0.0049831384796043795,0 +hellaswag,acc_norm,0.6274646484763992,0.004824917516374194,0 +piqa,acc,0.7573449401523396,0.0100020025697087,0 +piqa,acc_norm,0.764417845484222,0.009901067586473893,0 +rte,acc,0.5667870036101083,0.029826764082138277,0 +sciq,acc,0.897,0.009616833339695798,0 +sciq,acc_norm,0.882,0.010206869264381791,0 +storycloze_2016,acc,0.7167290219134153,0.010419760409155363,0 +winogrande,acc,0.580110497237569,0.013870943986310391,0 diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_3_lm-eval_global_step80108_2023-02-24-15-37-26_3shots_backup.json b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_3_lm-eval_global_step80108_2023-02-24-15-37-26_3shots_backup.json deleted file mode 100644 index 21b41f0b389920760579eb067c5abd57e9f956bd..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_3_lm-eval_global_step80108_2023-02-24-15-37-26_3shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.306, - "acc_stderr": 0.014580006055436967 - }, - "anli_r2": { - "acc": 0.361, - "acc_stderr": 0.015195720118175122 - }, - "anli_r3": { - "acc": 0.31583333333333335, - "acc_stderr": 0.01342456883035645 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.06460957383809221, - "f1": 0.2902724515627741 - }, - "copa": { - "acc": 0.81, - "acc_stderr": 0.03942772444036623 - }, - "hellaswag": { - "acc": 0.4742083250348536, - "acc_stderr": 0.0049831384796043795, - "acc_norm": 0.6274646484763992, - "acc_norm_stderr": 0.004824917516374194 - }, - "rte": { - "acc": 0.5667870036101083, - "acc_stderr": 0.029826764082138277 - }, - "winogrande": { - "acc": 0.580110497237569, - 
"acc_stderr": 0.013870943986310391 - }, - "storycloze_2016": { - "acc": 0.7167290219134153, - "acc_stderr": 0.010419760409155363 - }, - "boolq": { - "acc": 0.6042813455657492, - "acc_stderr": 0.008552742471459792 - }, - "arc_easy": { - "acc": 0.622895622895623, - "acc_stderr": 0.009945041946366515, - "acc_norm": 0.6035353535353535, - "acc_norm_stderr": 0.01003741276306453 - }, - "arc_challenge": { - "acc": 0.30119453924914674, - "acc_stderr": 0.01340674176784762, - "acc_norm": 0.3250853242320819, - "acc_norm_stderr": 0.013688147309729122 - }, - "sciq": { - "acc": 0.897, - "acc_stderr": 0.009616833339695798, - "acc_norm": 0.882, - "acc_norm_stderr": 0.010206869264381791 - }, - "piqa": { - "acc": 0.7573449401523396, - "acc_stderr": 0.0100020025697087, - "acc_norm": 0.764417845484222, - "acc_norm_stderr": 0.009901067586473893 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_4.csv b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..41c379aaf54da4bcbf2bb0280467c757f4221cf2 --- /dev/null +++ b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.347,0.015060472031706622,0 +anli_r2,acc,0.361,0.015195720118175127,0 +anli_r3,acc,0.34,0.013680495725767787,0 +arc_challenge,acc,0.30119453924914674,0.01340674176784762,0 +arc_challenge,acc_norm,0.3370307167235495,0.013813476652902269,0 +arc_easy,acc,0.6283670033670034,0.009915897123658788,0 +arc_easy,acc_norm,0.5972222222222222,0.010063960494989163,0 +boolq,acc,0.5984709480122324,0.008573784490094754,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.266719222178426,,1 +copa,acc,0.82,0.038612291966536955,0 +hellaswag,acc,0.47400916152160927,0.004983035420235718,0 +hellaswag,acc_norm,0.6275642302330213,0.004824655406075563,0 +piqa,acc,0.7546245919477693,0.010039831320422396,0 +piqa,acc_norm,0.766050054406964,0.009877236895137446,0 +rte,acc,0.4981949458483754,0.030096267148976626,0 +sciq,acc,0.911,0.009008893392651518,0 +sciq,acc_norm,0.898,0.009575368801653892,0 +storycloze_2016,acc,0.7247461250668092,0.010328538400500567,0 +winogrande,acc,0.584846093133386,0.013848684086658585,0 diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_4_lm-eval_global_step80108_2023-02-24-15-37-26_4shots_backup.json b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_4_lm-eval_global_step80108_2023-02-24-15-37-26_4shots_backup.json deleted file mode 100644 index 1dfa15c148c4961db73d79b0191b5560e91df659..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_4_lm-eval_global_step80108_2023-02-24-15-37-26_4shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.347, - "acc_stderr": 0.015060472031706622 - }, - "anli_r2": { - "acc": 0.361, - "acc_stderr": 0.015195720118175127 - }, - "anli_r3": { - "acc": 0.34, - "acc_stderr": 0.013680495725767787 - }, - "cb": { - "acc": 0.375, - "acc_stderr": 0.06527912098338669, - "f1": 0.266719222178426 - }, - "copa": { - "acc": 0.82, - "acc_stderr": 0.038612291966536955 - }, - "hellaswag": { - "acc": 0.47400916152160927, - "acc_stderr": 0.004983035420235718, - "acc_norm": 
0.6275642302330213, - "acc_norm_stderr": 0.004824655406075563 - }, - "rte": { - "acc": 0.4981949458483754, - "acc_stderr": 0.030096267148976626 - }, - "winogrande": { - "acc": 0.584846093133386, - "acc_stderr": 0.013848684086658585 - }, - "storycloze_2016": { - "acc": 0.7247461250668092, - "acc_stderr": 0.010328538400500567 - }, - "boolq": { - "acc": 0.5984709480122324, - "acc_stderr": 0.008573784490094754 - }, - "arc_easy": { - "acc": 0.6283670033670034, - "acc_stderr": 0.009915897123658788, - "acc_norm": 0.5972222222222222, - "acc_norm_stderr": 0.010063960494989163 - }, - "arc_challenge": { - "acc": 0.30119453924914674, - "acc_stderr": 0.01340674176784762, - "acc_norm": 0.3370307167235495, - "acc_norm_stderr": 0.013813476652902269 - }, - "sciq": { - "acc": 0.911, - "acc_stderr": 0.009008893392651518, - "acc_norm": 0.898, - "acc_norm_stderr": 0.009575368801653892 - }, - "piqa": { - "acc": 0.7546245919477693, - "acc_stderr": 0.010039831320422396, - "acc_norm": 0.766050054406964, - "acc_norm_stderr": 0.009877236895137446 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_5.csv b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..747f98b935cfe934f599b6c687bf35df5614e284 --- /dev/null +++ b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.334,0.014922019523732977,0 +anli_r2,acc,0.351,0.015100563798316407,0 +anli_r3,acc,0.32083333333333336,0.013480882752851543,0 +arc_challenge,acc,0.3003412969283277,0.013395909309956995,0 +arc_challenge,acc_norm,0.3225255972696246,0.013659980894277373,0 +arc_easy,acc,0.6258417508417509,0.009929516948977627,0 +arc_easy,acc_norm,0.6022727272727273,0.010042861602178058,0 +boolq,acc,0.6073394495412844,0.008541161248702913,1 +cb,acc,0.42857142857142855,0.06672848092813058,1 +cb,f1,0.2943692088382039,,1 +copa,acc,0.77,0.04229525846816506,0 +hellaswag,acc,0.472814180442143,0.004982400368939667,0 +hellaswag,acc_norm,0.6304521011750648,0.00481695881772609,0 +piqa,acc,0.7513601741022851,0.010084511234296852,0 +piqa,acc_norm,0.7687704026115343,0.009837063180625336,0 +rte,acc,0.5415162454873647,0.029992535385373314,0 +sciq,acc,0.914,0.008870325962594766,0 +sciq,acc_norm,0.901,0.009449248027662734,0 +storycloze_2016,acc,0.7188669160876536,0.0103958360916281,0 +winogrande,acc,0.5911602209944752,0.0138169542951357,0 diff --git a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_5_lm-eval_global_step80108_2023-02-24-15-37-25_5shots_backup.json b/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_5_lm-eval_global_step80108_2023-02-24-15-37-25_5shots_backup.json deleted file mode 100644 index c19fa92879d665ed88cd89445cedfd9cbf1faa77..0000000000000000000000000000000000000000 --- a/4b284b84bc4v2seed4/evaluation/rankeval/4b284b84bc4v2seed4_5_lm-eval_global_step80108_2023-02-24-15-37-25_5shots_backup.json +++ /dev/null @@ -1,87 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.334, - "acc_stderr": 0.014922019523732977 - }, - "anli_r2": { - "acc": 0.351, - "acc_stderr": 0.015100563798316407 - }, - "anli_r3": { - "acc": 0.32083333333333336, - "acc_stderr": 0.013480882752851543 - }, - "cb": { - "acc": 
0.42857142857142855, - "acc_stderr": 0.06672848092813058, - "f1": 0.2943692088382039 - }, - "copa": { - "acc": 0.77, - "acc_stderr": 0.04229525846816506 - }, - "hellaswag": { - "acc": 0.472814180442143, - "acc_stderr": 0.004982400368939667, - "acc_norm": 0.6304521011750648, - "acc_norm_stderr": 0.00481695881772609 - }, - "rte": { - "acc": 0.5415162454873647, - "acc_stderr": 0.029992535385373314 - }, - "winogrande": { - "acc": 0.5911602209944752, - "acc_stderr": 0.0138169542951357 - }, - "storycloze_2016": { - "acc": 0.7188669160876536, - "acc_stderr": 0.0103958360916281 - }, - "boolq": { - "acc": 0.6073394495412844, - "acc_stderr": 0.008541161248702913 - }, - "arc_easy": { - "acc": 0.6258417508417509, - "acc_stderr": 0.009929516948977627, - "acc_norm": 0.6022727272727273, - "acc_norm_stderr": 0.010042861602178058 - }, - "arc_challenge": { - "acc": 0.3003412969283277, - "acc_stderr": 0.013395909309956995, - "acc_norm": 0.3225255972696246, - "acc_norm_stderr": 0.013659980894277373 - }, - "sciq": { - "acc": 0.914, - "acc_stderr": 0.008870325962594766, - "acc_norm": 0.901, - "acc_norm_stderr": 0.009449248027662734 - }, - "piqa": { - "acc": 0.7513601741022851, - "acc_stderr": 0.010084511234296852, - "acc_norm": 0.7687704026115343, - "acc_norm_stderr": 0.009837063180625336 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0, - "hellaswag": 0, - "rte": 0, - "winogrande": 0, - "storycloze_2016": 0, - "boolq": 1, - "arc_easy": 0, - "arc_challenge": 0, - "sciq": 0, - "piqa": 0 - } -} \ No newline at end of file