diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..17f13a1ed44946e2a4b73d3349c49e0c066629e2 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.2562657244023514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028344931968085282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.062384701456087446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020959572066061704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.2778238258177499, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0053586090146941585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.08917647320322222, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001969587935532424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.026957871325524504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00119791534895436}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.12120656565208492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003245668709085193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.03885973727074715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011411233576479184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.059948665447718395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001993918360545074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2696777384075072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005223094399514082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08588431201368854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018539010477647254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.05914169795430463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020593373696654797}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.25838316242026105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00491849430291812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.08392617035294098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001881634203344931}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1c4e1615865df649dad080268b80baa90753835a --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.44906636509622533, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.039450588630595015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.11288061517490074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00386994874457578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.27968139569340955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004818798730073509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13316786961188862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032035615287408364}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.052903771126418274, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024560083140397187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.13670929059101208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032752111527625427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0635687916821055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020746952988998694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.10294903334837356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034785078085129637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.26443193955147987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004508721603501755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12246686808714284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027865982212275067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1043770278231783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035484629358203635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2648398336113232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004503000799632382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12355055054182587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002841014595626855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..bc8a4d7237e68e1304a4fe013f4f8e7711d26c00 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4687852352128821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037809750600987985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.12939788800380767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004056238000462917}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3205721940394599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004784451352989951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15485025651871712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034302178802564512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06272684866048131, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002451752863709562}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16040449198937512, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003451151403396264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07545945954698206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021937365311268525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11538953087200472, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00346589668028597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30175228398138576, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004432821529779081}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.14062442945039813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028845693981807787}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11767324801231895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035688465184641432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3033452610052125, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004447592173686305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.14250354572951518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002959772587239683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a230bfd7d9bec485da9e0a2a7d0ad2b6d6b9586a --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5387522714694768, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.023700066593108533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.14028377513050538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004341036446200867}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33808480225227105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004840897586860907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1660401631118107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037496181550606754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07249658632299814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029199126351436076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1738659001719329, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003647562909324833}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08448456167004659, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002619780755266116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12529450659786737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003740919506421422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3182618579312013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004525730253384199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15093871662748365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0032225789949202046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.12866365606890057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003897701131855631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3209266755398485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004550196873941636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15379137375695537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033296076273151513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..90865ba04e38183e8de5ce969fb3a9308f936f0c --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6154728892401495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05492052153457257}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1416216780254008, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004403560474088744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.350369181874358, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004821260815960307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.16918194985742865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003726050287797417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07377427634131609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029769913238928973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18193245830782417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003657633632618109}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08639654396535042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025466510150414677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12691040102952247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038030608678551996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33016396642156515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004483672398798779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15419230913013102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031966205747524113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1301329876623986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003962229410505262}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3326898264482343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004514627367420683}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15686161901707832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003303387308503824}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8cf2365c87d86707dfad7e575b4cb74ca8c336b2 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6280080728696126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04207890973166607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.15509132478375567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004705843597864089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36072408074578627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004932090632111422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1785781279253249, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003864561091208021}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.08180998073756966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0031797028106942695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1895120629191651, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037580265385154963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.09234282351104058, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026995310439716595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.13813720543380648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004053616600136467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33922496409575775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004622505715509403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1620344936907299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003324512248892316}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.14168764741861828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004239364453934911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.34078189394441194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004625313010789047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.16446961325308998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034245269422962608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..8a134d07c0309a33d6725b18834226837cf3dc3a --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.14831832571904127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020031574591708096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.23669379639579727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002619263402445374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16843326604027012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018787988149176356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.027495279137204204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008008675778483577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04558631579219217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013034624565288761}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03124497623150672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000826359540629872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11944058601706234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014613554843883904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.197662087047554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022098401272736727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1373698337101501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014061340796650912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.13662513471708013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001834224913870075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21848967035350617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00242500324850602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15530559329563356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017277092690194619}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4416513225274035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07150581722616807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..475998c237a807d9d5d36709a3cfcba6a0fdd3de --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2474989333475308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004192775409890903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.16589702996724112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002477684211999076}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1595930665634592, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020260063889951897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06456439217534583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026036519415425876}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.03395563608028862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011867665512589468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03442090774378983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011197085684525982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2007100384592662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036402244436710527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.12968060330920403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019100623389590725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.12543184498335516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015786460435038327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2356687196157363, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004040097148447038}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.15692818247747958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023149233432171795}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15113313197846198, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019006022698345565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.9112541973999004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08163120654641273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5f5e4839e410ad3a966e6c304c46ec5e3744ddd0 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.3671552578777415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00464310641179873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.18316424412100443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002534818661881543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20013093911819188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021449192871676984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.11652986149569292, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033165278944766326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.048557963351484004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012897480876968325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05444986552431719, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012835369876590637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.3030100733810117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004160446544180668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1454519391726235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020102539956042787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1600577177279773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017194821563251488}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.34948494261958846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00452044298551405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.17241597783925564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023681037127358745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18899973850835908, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020281161914398288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.4920293719291493, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06820740373159184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..d786a5cc8dd6655cd3010e63710e5b0eec5d4a51 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.31853023306817924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005101746577027814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.14722792301810253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026467115286204013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16433308908585867, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002402134739956357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.10260882729326853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033488070271773295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04037648707007591, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012615029724490608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04583954485462417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012897030660356447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.26690611784240603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004530048771512807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.11874183155575303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021251987841602225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13364146877613045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00195553742839865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.3046492013868888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004949401564936648}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1393491071946238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024976587731928646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15587618164528091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002273493914941302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.6055821821151304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06565825928935355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e0d4e4ca13ac6541ca2fdf4ba7efd9a06a8d19a --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.10914047578759525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004168163234199949}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.04875414041264775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020018963702338544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05455257649733116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002012404693346845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.037261877113724175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002437336913811512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.013448412424643126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008658300088105147}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01577378906241384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009385589100462621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.09325777757588789, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003680574837942309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.03991105511182912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016279952050045603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.045100032510421204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016666399875103265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.10390247551362326, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004006559051889402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.04597031630948619, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018861168846922784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.051422475813845084, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018931617381586926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.006932560692974423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0017990239865701577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6b0cd28d51bcb0fa91851c98443c37dc91002ebd --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.019024182319545703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019567530452185085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.00809080049652358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008930277662354756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009250948371284507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009365636508956353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.007878162042298584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012119853940782482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0024981499124714754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00036249889299460995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0031062476201456015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00043823448085888057}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.016781942931569222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017964368358031513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.006703878314586593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007330971808896851}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007784924334651645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007922903839285351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.01855296243342967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001923182050974051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.00783323994370169, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008689584836341853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008943947340808445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009064353361518755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5486400435190646e-20, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.928974165985541e-18}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..db07e847193c6ecbe5f1d364a8cc85498c3edaf4 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06536803730199292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040080446277792475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0562578647719299, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003466710519052521}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05638135159346031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033862796262482922}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.013453387419618108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014082914388032289}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.012484556808642255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012859869239756115}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01218356666574454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012325430366165824}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.049695294130091966, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031067522139507655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04191724945748147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026229390757436582}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.042197213021247744, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025684294584781297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0501407378152416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00312422462775062}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04244935232602974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026544789695008624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.042656324273577836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002590374442922286}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.114133025618453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03902166340342278}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c793b055cfb7f8b2b49ff5b83c3b83a89871d074 --- /dev/null +++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9d45c2505bca1318cda5c7928e1902f1eac12e1d 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da240232ef9803594f89287674f0f363b3ee085ab5ada3f7ba1fcf42c74238a9 +size 4174292 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..66a02feda232d18a6b6dce7f60a434a9c5f67645 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1792872b8360c67aa01618d992d596079f04524a1f1503f34e0c5a8ea05d498 +size 4762910 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..557b9aea14a3f72b806733a789b376e00039e7c7 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b9f81e6fa3c44ef15ccfd5e9a1b3a9c9e0ca5f9d010e2e5a1f113c7e000edae +size 5711084 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..29e11ea80075b543fbfa3eadec1fe22923104d3b 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6815796d6e2d668597f9433d5131bd8c3111fe44288ea40c86c6e7ce22c2c5a +size 6596539 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..88686f9ab4b7d88483b3d0b86632ddaedaf0ca91 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bfb2896e5849621f3f7a89e6828f03535322465ea21e99c1dc62eee20af2496 +size 7512274 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c6307ae5f63066e55ad7c123b9d415224301fdf8 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6857bff28723014fc359741f07d82b77ccad062bda89d8b023ae9517dfde3048 +size 8376795 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..732d3df044b54c4d576e765af6b66cf7d49ede50 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6cae8b73c1ba64dd6f8a95d50808d0f96c3442588e0ee7c521e666397beb40 +size 12958324 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..aa5c07f6c59078d2fbbe29e9a98246ef8ad15065 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa0eff950ed43d1c253f2b1a3f259895bbffa3bee9d2b6502468fe7c3de050b8 +size 18431550 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8a1a1aae7e88cf0a75a6669233bc65f8063297f9 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7974ff13a24110559f516d7f81ffc6072c020bc498ceca28c4e7640bfc930e9e +size 23921835 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..26925b4e534bbded415d06a83413693fe11b0c6a 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b312eade5aa25054bd7f12b3e19c5d7ea6499e17edc996b6caa4bdcfca0d531a +size 29333993 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..30255018b0818df0e08f651520126acc6b1c7bce 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e70db861ffdbfc8cf88d452d03a5b56b13a4b70c3a538eb70524271e3f7804 +size 34778872 diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0aaee5fe029787cee5ee904b4410f5c9a2883942 100644 --- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl +++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d4d0227bd4d26f3e9a99d734f3d030328c20d626c6f922920b3dc155977f38 +size 13896009 diff --git a/evaluation/generation/merged.csv b/evaluation/generation/merged.csv new file mode 100644 index 0000000000000000000000000000000000000000..5f415c367a2fa331512a8a072e9caca8883e8766 --- /dev/null +++ b/evaluation/generation/merged.csv @@ -0,0 +1,53 @@ +dataset,fewshots,prompt,metric,value +e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.07031535968691954 +e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.07031535968691954 +e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21317325143407617 +e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21317325143407617 +e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23375071908985015 +e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23375071908985015 +e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.238646665315093 +e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.238646665315093 +e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23937430569965518 +e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23937430569965518 +e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23628094628078694 +e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23628094628078694 +e2e_nlg_cleaned,5,average,multiple,0.20525687458439684 +gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.02780714664643612 +gem_xsum,0,median,rouge2_fmeasure,0.02780714664643612 +gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04500583650450346 +gem_xsum,1,median,rouge2_fmeasure,0.04500583650450346 +gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.051882326943507785 +gem_xsum,2,median,rouge2_fmeasure,0.051882326943507785 +gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05187808695380671 +gem_xsum,3,median,rouge2_fmeasure,0.05187808695380671 +gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01218356666574454 +gem_xsum,4,median,rouge2_fmeasure,0.01218356666574454 +gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0 +gem_xsum,5,median,rouge2_fmeasure,0.0 +gem_xsum,5,average,multiple,0.031459493952333106 +web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03885973727074715 +web_nlg_en,0,median,rouge2_fmeasure,0.03885973727074715 +web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0635687916821055 +web_nlg_en,1,median,rouge2_fmeasure,0.0635687916821055 +web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.07545945954698206 +web_nlg_en,2,median,rouge2_fmeasure,0.07545945954698206 +web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.08448456167004659 +web_nlg_en,3,median,rouge2_fmeasure,0.08448456167004659 +web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.08639654396535042 +web_nlg_en,4,median,rouge2_fmeasure,0.08639654396535042 +web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.09234282351104058 +web_nlg_en,5,median,rouge2_fmeasure,0.09234282351104058 +web_nlg_en,5,average,multiple,0.07351865294104538 +wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03124497623150672 +wiki_lingua_en,0,median,rouge2_fmeasure,0.03124497623150672 +wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03442090774378983 +wiki_lingua_en,1,median,rouge2_fmeasure,0.03442090774378983 +wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05444986552431719 +wiki_lingua_en,2,median,rouge2_fmeasure,0.05444986552431719 +wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04583954485462417 +wiki_lingua_en,3,median,rouge2_fmeasure,0.04583954485462417 +wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01577378906241384 +wiki_lingua_en,4,median,rouge2_fmeasure,0.01577378906241384 +wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031062476201456015 +wiki_lingua_en,5,median,rouge2_fmeasure,0.0031062476201456015 +wiki_lingua_en,5,average,multiple,0.030805888506132893 diff --git a/evaluation/generation/merged.json b/evaluation/generation/merged.json new file mode 100644 index 0000000000000000000000000000000000000000..19363da1a22475f96542a0f68f986c6959491b00 --- /dev/null +++ b/evaluation/generation/merged.json @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2562657244023514, "bleu_stderr": 0.028344931968085282, "rouge1_fmeasure": 0.08917647320322222, "rouge1_fmeasure_stderr": 0.001969587935532424, "rouge1_precision": 0.062384701456087446, "rouge1_precision_stderr": 0.0020959572066061704, "rouge1_recall": 0.2778238258177499, "rouge1_recall_stderr": 0.0053586090146941585, "rouge2_fmeasure": 0.03885973727074715, "rouge2_fmeasure_stderr": 0.0011411233576479184, "rouge2_precision": 0.026957871325524504, "rouge2_precision_stderr": 0.00119791534895436, "rouge2_recall": 0.12120656565208492, "rouge2_recall_stderr": 0.003245668709085193, "rougeL_fmeasure": 0.08588431201368854, "rougeL_fmeasure_stderr": 0.0018539010477647254, "rougeL_precision": 0.059948665447718395, "rougeL_precision_stderr": 0.001993918360545074, "rougeL_recall": 0.2696777384075072, "rougeL_recall_stderr": 0.005223094399514082, "rougeLsum_fmeasure": 0.08392617035294098, "rougeLsum_fmeasure_stderr": 0.001881634203344931, "rougeLsum_precision": 0.05914169795430463, "rougeLsum_precision_stderr": 0.0020593373696654797, "rougeLsum_recall": 0.25838316242026105, "rougeLsum_recall_stderr": 0.00491849430291812}}, "1": {"PALM_prompt": {"bleu": 0.44906636509622533, "bleu_stderr": 0.039450588630595015, "rouge1_fmeasure": 0.13316786961188862, "rouge1_fmeasure_stderr": 0.0032035615287408364, "rouge1_precision": 0.11288061517490074, "rouge1_precision_stderr": 0.00386994874457578, "rouge1_recall": 0.27968139569340955, "rouge1_recall_stderr": 0.004818798730073509, "rouge2_fmeasure": 0.0635687916821055, "rouge2_fmeasure_stderr": 0.0020746952988998694, "rouge2_precision": 0.052903771126418274, "rouge2_precision_stderr": 0.0024560083140397187, "rouge2_recall": 0.13670929059101208, "rouge2_recall_stderr": 0.0032752111527625427, "rougeL_fmeasure": 0.12246686808714284, "rougeL_fmeasure_stderr": 0.0027865982212275067, "rougeL_precision": 0.10294903334837356, "rougeL_precision_stderr": 0.0034785078085129637, "rougeL_recall": 0.26443193955147987, "rougeL_recall_stderr": 0.004508721603501755, "rougeLsum_fmeasure": 0.12355055054182587, "rougeLsum_fmeasure_stderr": 0.002841014595626855, "rougeLsum_precision": 0.1043770278231783, "rougeLsum_precision_stderr": 0.0035484629358203635, "rougeLsum_recall": 0.2648398336113232, "rougeLsum_recall_stderr": 0.004503000799632382}}, "2": {"PALM_prompt": {"bleu": 0.4687852352128821, "bleu_stderr": 0.037809750600987985, "rouge1_fmeasure": 0.15485025651871712, "rouge1_fmeasure_stderr": 0.0034302178802564512, "rouge1_precision": 0.12939788800380767, "rouge1_precision_stderr": 0.004056238000462917, "rouge1_recall": 0.3205721940394599, "rouge1_recall_stderr": 0.004784451352989951, "rouge2_fmeasure": 0.07545945954698206, "rouge2_fmeasure_stderr": 0.0021937365311268525, "rouge2_precision": 0.06272684866048131, "rouge2_precision_stderr": 0.002451752863709562, "rouge2_recall": 0.16040449198937512, "rouge2_recall_stderr": 0.003451151403396264, "rougeL_fmeasure": 0.14062442945039813, "rougeL_fmeasure_stderr": 0.0028845693981807787, "rougeL_precision": 0.11538953087200472, "rougeL_precision_stderr": 0.00346589668028597, "rougeL_recall": 0.30175228398138576, "rougeL_recall_stderr": 0.004432821529779081, "rougeLsum_fmeasure": 0.14250354572951518, "rougeLsum_fmeasure_stderr": 0.002959772587239683, "rougeLsum_precision": 0.11767324801231895, "rougeLsum_precision_stderr": 0.0035688465184641432, "rougeLsum_recall": 0.3033452610052125, "rougeLsum_recall_stderr": 0.004447592173686305}}, "3": {"PALM_prompt": {"bleu": 0.5387522714694768, "bleu_stderr": 0.023700066593108533, "rouge1_fmeasure": 0.1660401631118107, "rouge1_fmeasure_stderr": 0.0037496181550606754, "rouge1_precision": 0.14028377513050538, "rouge1_precision_stderr": 0.004341036446200867, "rouge1_recall": 0.33808480225227105, "rouge1_recall_stderr": 0.004840897586860907, "rouge2_fmeasure": 0.08448456167004659, "rouge2_fmeasure_stderr": 0.002619780755266116, "rouge2_precision": 0.07249658632299814, "rouge2_precision_stderr": 0.0029199126351436076, "rouge2_recall": 0.1738659001719329, "rouge2_recall_stderr": 0.003647562909324833, "rougeL_fmeasure": 0.15093871662748365, "rougeL_fmeasure_stderr": 0.0032225789949202046, "rougeL_precision": 0.12529450659786737, "rougeL_precision_stderr": 0.003740919506421422, "rougeL_recall": 0.3182618579312013, "rougeL_recall_stderr": 0.004525730253384199, "rougeLsum_fmeasure": 0.15379137375695537, "rougeLsum_fmeasure_stderr": 0.0033296076273151513, "rougeLsum_precision": 0.12866365606890057, "rougeLsum_precision_stderr": 0.003897701131855631, "rougeLsum_recall": 0.3209266755398485, "rougeLsum_recall_stderr": 0.004550196873941636}}, "4": {"PALM_prompt": {"bleu": 0.6154728892401495, "bleu_stderr": 0.05492052153457257, "rouge1_fmeasure": 0.16918194985742865, "rouge1_fmeasure_stderr": 0.003726050287797417, "rouge1_precision": 0.1416216780254008, "rouge1_precision_stderr": 0.004403560474088744, "rouge1_recall": 0.350369181874358, "rouge1_recall_stderr": 0.004821260815960307, "rouge2_fmeasure": 0.08639654396535042, "rouge2_fmeasure_stderr": 0.0025466510150414677, "rouge2_precision": 0.07377427634131609, "rouge2_precision_stderr": 0.0029769913238928973, "rouge2_recall": 0.18193245830782417, "rouge2_recall_stderr": 0.003657633632618109, "rougeL_fmeasure": 0.15419230913013102, "rougeL_fmeasure_stderr": 0.0031966205747524113, "rougeL_precision": 0.12691040102952247, "rougeL_precision_stderr": 0.0038030608678551996, "rougeL_recall": 0.33016396642156515, "rougeL_recall_stderr": 0.004483672398798779, "rougeLsum_fmeasure": 0.15686161901707832, "rougeLsum_fmeasure_stderr": 0.003303387308503824, "rougeLsum_precision": 0.1301329876623986, "rougeLsum_precision_stderr": 0.003962229410505262, "rougeLsum_recall": 0.3326898264482343, "rougeLsum_recall_stderr": 0.004514627367420683}}, "5": {"PALM_prompt": {"bleu": 0.6280080728696126, "bleu_stderr": 0.04207890973166607, "rouge1_fmeasure": 0.1785781279253249, "rouge1_fmeasure_stderr": 0.003864561091208021, "rouge1_precision": 0.15509132478375567, "rouge1_precision_stderr": 0.004705843597864089, "rouge1_recall": 0.36072408074578627, "rouge1_recall_stderr": 0.004932090632111422, "rouge2_fmeasure": 0.09234282351104058, "rouge2_fmeasure_stderr": 0.0026995310439716595, "rouge2_precision": 0.08180998073756966, "rouge2_precision_stderr": 0.0031797028106942695, "rouge2_recall": 0.1895120629191651, "rouge2_recall_stderr": 0.0037580265385154963, "rougeL_fmeasure": 0.1620344936907299, "rougeL_fmeasure_stderr": 0.003324512248892316, "rougeL_precision": 0.13813720543380648, "rougeL_precision_stderr": 0.004053616600136467, "rougeL_recall": 0.33922496409575775, "rougeL_recall_stderr": 0.004622505715509403, "rougeLsum_fmeasure": 0.16446961325308998, "rougeLsum_fmeasure_stderr": 0.0034245269422962608, "rougeLsum_precision": 0.14168764741861828, "rougeLsum_precision_stderr": 0.004239364453934911, "rougeLsum_recall": 0.34078189394441194, "rougeLsum_recall_stderr": 0.004625313010789047}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4416513225274035, "bleu_stderr": 0.07150581722616807, "rouge1_fmeasure": 0.16843326604027012, "rouge1_fmeasure_stderr": 0.0018787988149176356, "rouge1_precision": 0.14831832571904127, "rouge1_precision_stderr": 0.0020031574591708096, "rouge1_recall": 0.23669379639579727, "rouge1_recall_stderr": 0.002619263402445374, "rouge2_fmeasure": 0.03124497623150672, "rouge2_fmeasure_stderr": 0.000826359540629872, "rouge2_precision": 0.027495279137204204, "rouge2_precision_stderr": 0.0008008675778483577, "rouge2_recall": 0.04558631579219217, "rouge2_recall_stderr": 0.0013034624565288761, "rougeL_fmeasure": 0.1373698337101501, "rougeL_fmeasure_stderr": 0.0014061340796650912, "rougeL_precision": 0.11944058601706234, "rougeL_precision_stderr": 0.0014613554843883904, "rougeL_recall": 0.197662087047554, "rougeL_recall_stderr": 0.0022098401272736727, "rougeLsum_fmeasure": 0.15530559329563356, "rougeLsum_fmeasure_stderr": 0.0017277092690194619, "rougeLsum_precision": 0.13662513471708013, "rougeLsum_precision_stderr": 0.001834224913870075, "rougeLsum_recall": 0.21848967035350617, "rougeLsum_recall_stderr": 0.00242500324850602}}, "1": {"tldr_en": {"bleu": 1.9112541973999004, "bleu_stderr": 0.08163120654641273, "rouge1_fmeasure": 0.1595930665634592, "rouge1_fmeasure_stderr": 0.0020260063889951897, "rouge1_precision": 0.2474989333475308, "rouge1_precision_stderr": 0.004192775409890903, "rouge1_recall": 0.16589702996724112, "rouge1_recall_stderr": 0.002477684211999076, "rouge2_fmeasure": 0.03442090774378983, "rouge2_fmeasure_stderr": 0.0011197085684525982, "rouge2_precision": 0.06456439217534583, "rouge2_precision_stderr": 0.0026036519415425876, "rouge2_recall": 0.03395563608028862, "rouge2_recall_stderr": 0.0011867665512589468, "rougeL_fmeasure": 0.12543184498335516, "rougeL_fmeasure_stderr": 0.0015786460435038327, "rougeL_precision": 0.2007100384592662, "rougeL_precision_stderr": 0.0036402244436710527, "rougeL_recall": 0.12968060330920403, "rougeL_recall_stderr": 0.0019100623389590725, "rougeLsum_fmeasure": 0.15113313197846198, "rougeLsum_fmeasure_stderr": 0.0019006022698345565, "rougeLsum_precision": 0.2356687196157363, "rougeLsum_precision_stderr": 0.004040097148447038, "rougeLsum_recall": 0.15692818247747958, "rougeLsum_recall_stderr": 0.0023149233432171795}}, "2": {"tldr_en": {"bleu": 2.4920293719291493, "bleu_stderr": 0.06820740373159184, "rouge1_fmeasure": 0.20013093911819188, "rouge1_fmeasure_stderr": 0.0021449192871676984, "rouge1_precision": 0.3671552578777415, "rouge1_precision_stderr": 0.00464310641179873, "rouge1_recall": 0.18316424412100443, "rouge1_recall_stderr": 0.002534818661881543, "rouge2_fmeasure": 0.05444986552431719, "rouge2_fmeasure_stderr": 0.0012835369876590637, "rouge2_precision": 0.11652986149569292, "rouge2_precision_stderr": 0.0033165278944766326, "rouge2_recall": 0.048557963351484004, "rouge2_recall_stderr": 0.0012897480876968325, "rougeL_fmeasure": 0.1600577177279773, "rougeL_fmeasure_stderr": 0.0017194821563251488, "rougeL_precision": 0.3030100733810117, "rougeL_precision_stderr": 0.004160446544180668, "rougeL_recall": 0.1454519391726235, "rougeL_recall_stderr": 0.0020102539956042787, "rougeLsum_fmeasure": 0.18899973850835908, "rougeLsum_fmeasure_stderr": 0.0020281161914398288, "rougeLsum_precision": 0.34948494261958846, "rougeLsum_precision_stderr": 0.00452044298551405, "rougeLsum_recall": 0.17241597783925564, "rougeLsum_recall_stderr": 0.0023681037127358745}}, "3": {"tldr_en": {"bleu": 1.6055821821151304, "bleu_stderr": 0.06565825928935355, "rouge1_fmeasure": 0.16433308908585867, "rouge1_fmeasure_stderr": 0.002402134739956357, "rouge1_precision": 0.31853023306817924, "rouge1_precision_stderr": 0.005101746577027814, "rouge1_recall": 0.14722792301810253, "rouge1_recall_stderr": 0.0026467115286204013, "rouge2_fmeasure": 0.04583954485462417, "rouge2_fmeasure_stderr": 0.0012897030660356447, "rouge2_precision": 0.10260882729326853, "rouge2_precision_stderr": 0.0033488070271773295, "rouge2_recall": 0.04037648707007591, "rouge2_recall_stderr": 0.0012615029724490608, "rougeL_fmeasure": 0.13364146877613045, "rougeL_fmeasure_stderr": 0.00195553742839865, "rougeL_precision": 0.26690611784240603, "rougeL_precision_stderr": 0.004530048771512807, "rougeL_recall": 0.11874183155575303, "rougeL_recall_stderr": 0.0021251987841602225, "rougeLsum_fmeasure": 0.15587618164528091, "rougeLsum_fmeasure_stderr": 0.002273493914941302, "rougeLsum_precision": 0.3046492013868888, "rougeLsum_precision_stderr": 0.004949401564936648, "rougeLsum_recall": 0.1393491071946238, "rougeLsum_recall_stderr": 0.0024976587731928646}}, "4": {"tldr_en": {"bleu": 0.006932560692974423, "bleu_stderr": 0.0017990239865701577, "rouge1_fmeasure": 0.05455257649733116, "rouge1_fmeasure_stderr": 0.002012404693346845, "rouge1_precision": 0.10914047578759525, "rouge1_precision_stderr": 0.004168163234199949, "rouge1_recall": 0.04875414041264775, "rouge1_recall_stderr": 0.0020018963702338544, "rouge2_fmeasure": 0.01577378906241384, "rouge2_fmeasure_stderr": 0.0009385589100462621, "rouge2_precision": 0.037261877113724175, "rouge2_precision_stderr": 0.002437336913811512, "rouge2_recall": 0.013448412424643126, "rouge2_recall_stderr": 0.0008658300088105147, "rougeL_fmeasure": 0.045100032510421204, "rougeL_fmeasure_stderr": 0.0016666399875103265, "rougeL_precision": 0.09325777757588789, "rougeL_precision_stderr": 0.003680574837942309, "rougeL_recall": 0.03991105511182912, "rougeL_recall_stderr": 0.0016279952050045603, "rougeLsum_fmeasure": 0.051422475813845084, "rougeLsum_fmeasure_stderr": 0.0018931617381586926, "rougeLsum_precision": 0.10390247551362326, "rougeLsum_precision_stderr": 0.004006559051889402, "rougeLsum_recall": 0.04597031630948619, "rougeLsum_recall_stderr": 0.0018861168846922784}}, "5": {"tldr_en": {"bleu": 1.5486400435190646e-20, "bleu_stderr": 5.928974165985541e-18, "rouge1_fmeasure": 0.009250948371284507, "rouge1_fmeasure_stderr": 0.0009365636508956353, "rouge1_precision": 0.019024182319545703, "rouge1_precision_stderr": 0.0019567530452185085, "rouge1_recall": 0.00809080049652358, "rouge1_recall_stderr": 0.0008930277662354756, "rouge2_fmeasure": 0.0031062476201456015, "rouge2_fmeasure_stderr": 0.00043823448085888057, "rouge2_precision": 0.007878162042298584, "rouge2_precision_stderr": 0.0012119853940782482, "rouge2_recall": 0.0024981499124714754, "rouge2_recall_stderr": 0.00036249889299460995, "rougeL_fmeasure": 0.007784924334651645, "rougeL_fmeasure_stderr": 0.0007922903839285351, "rougeL_precision": 0.016781942931569222, "rougeL_precision_stderr": 0.0017964368358031513, "rougeL_recall": 0.006703878314586593, "rougeL_recall_stderr": 0.0007330971808896851, "rougeLsum_fmeasure": 0.008943947340808445, "rougeLsum_fmeasure_stderr": 0.0009064353361518755, "rougeLsum_precision": 0.01855296243342967, "rougeLsum_precision_stderr": 0.001923182050974051, "rougeLsum_recall": 0.00783323994370169, "rougeLsum_recall_stderr": 0.0008689584836341853}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.9626050306721474, "bleu_stderr": 0.06024253979194259, "rouge1_fmeasure": 0.20284230759257002, "rouge1_fmeasure_stderr": 0.0017464348006575089, "rouge1_precision": 0.1540985111881967, "rouge1_precision_stderr": 0.0014515620097711474, "rouge1_recall": 0.31140262883046665, "rouge1_recall_stderr": 0.0024251988602698067, "rouge2_fmeasure": 0.07031535968691954, "rouge2_fmeasure_stderr": 0.001168196298024764, "rouge2_precision": 0.05355018382472272, "rouge2_precision_stderr": 0.0009106172876128309, "rouge2_recall": 0.10690154622755875, "rouge2_recall_stderr": 0.0017688696499961142, "rougeL_fmeasure": 0.1753585977687643, "rougeL_fmeasure_stderr": 0.0014608296346256319, "rougeL_precision": 0.1327617315507999, "rougeL_precision_stderr": 0.001197611976138011, "rougeL_recall": 0.2711114650902915, "rougeL_recall_stderr": 0.0021100724050297824, "rougeLsum_fmeasure": 0.18282034804175978, "rougeLsum_fmeasure_stderr": 0.0016241979991028408, "rougeLsum_precision": 0.1387916405736721, "rougeLsum_precision_stderr": 0.0013406445609266567, "rougeLsum_recall": 0.2810609872616038, "rougeLsum_recall_stderr": 0.0022804790016154107}}, "1": {"generate_text_restaurant": {"bleu": 11.593827123577368, "bleu_stderr": 0.12407021076125696, "rouge1_fmeasure": 0.45020216196031243, "rouge1_fmeasure_stderr": 0.0023747076071133698, "rouge1_precision": 0.5509122991009537, "rouge1_precision_stderr": 0.0033661250504397264, "rouge1_recall": 0.42140078880413323, "rouge1_recall_stderr": 0.002953747532093766, "rouge2_fmeasure": 0.21317325143407617, "rouge2_fmeasure_stderr": 0.0020127889592610262, "rouge2_precision": 0.2650742428626904, "rouge2_precision_stderr": 0.0027402930606959166, "rouge2_recall": 0.19921132653073137, "rouge2_recall_stderr": 0.0021274437677356846, "rougeL_fmeasure": 0.32668549807617586, "rougeL_fmeasure_stderr": 0.002086318029186934, "rougeL_precision": 0.40311524575775876, "rougeL_precision_stderr": 0.003054424817440509, "rougeL_recall": 0.30479322858663915, "rougeL_recall_stderr": 0.0023980133198678024, "rougeLsum_fmeasure": 0.3686111369512851, "rougeLsum_fmeasure_stderr": 0.0023418416254223887, "rougeLsum_precision": 0.4523945212247963, "rougeLsum_precision_stderr": 0.00326815326036064, "rougeLsum_recall": 0.3446091324717908, "rougeLsum_recall_stderr": 0.0027191831013193444}}, "2": {"generate_text_restaurant": {"bleu": 12.88190096920724, "bleu_stderr": 0.17583922253090617, "rouge1_fmeasure": 0.4721264666575481, "rouge1_fmeasure_stderr": 0.002268547262390229, "rouge1_precision": 0.5822720419906148, "rouge1_precision_stderr": 0.0032962978807536585, "rouge1_recall": 0.4337280880183242, "rouge1_recall_stderr": 0.0028431977686949614, "rouge2_fmeasure": 0.23375071908985015, "rouge2_fmeasure_stderr": 0.0020523859536162883, "rouge2_precision": 0.2931157748619445, "rouge2_precision_stderr": 0.0028165108542885105, "rouge2_recall": 0.2147077046167617, "rouge2_recall_stderr": 0.0021786795843148085, "rougeL_fmeasure": 0.3522716641150115, "rougeL_fmeasure_stderr": 0.0020991364053741914, "rougeL_precision": 0.4369685674136242, "rougeL_precision_stderr": 0.003073517456588823, "rougeL_recall": 0.32297689774894583, "rougeL_recall_stderr": 0.0024097063140581243, "rougeLsum_fmeasure": 0.393682260751447, "rougeLsum_fmeasure_stderr": 0.0023007053568388975, "rougeLsum_precision": 0.4863687819740207, "rougeLsum_precision_stderr": 0.0032507530347318466, "rougeLsum_recall": 0.3613566974986443, "rougeLsum_recall_stderr": 0.0026568660208198115}}, "3": {"generate_text_restaurant": {"bleu": 13.262436270778784, "bleu_stderr": 0.12966668718803287, "rouge1_fmeasure": 0.47525930406845607, "rouge1_fmeasure_stderr": 0.0022663496963600087, "rouge1_precision": 0.5813922477147344, "rouge1_precision_stderr": 0.0032594641701596022, "rouge1_recall": 0.43750260743630337, "rouge1_recall_stderr": 0.002850951212768957, "rouge2_fmeasure": 0.238646665315093, "rouge2_fmeasure_stderr": 0.0020938647359093243, "rouge2_precision": 0.2963848070406136, "rouge2_precision_stderr": 0.002811561987291426, "rouge2_recall": 0.2198532494536229, "rouge2_recall_stderr": 0.0022399911777690423, "rougeL_fmeasure": 0.3550920935079866, "rougeL_fmeasure_stderr": 0.002168420115905195, "rougeL_precision": 0.43608943087396823, "rougeL_precision_stderr": 0.0030728003407875415, "rougeL_recall": 0.32653357863256616, "rougeL_recall_stderr": 0.002478214580278462, "rougeLsum_fmeasure": 0.3982122870175536, "rougeLsum_fmeasure_stderr": 0.002333698912202139, "rougeLsum_precision": 0.4873959951365985, "rougeLsum_precision_stderr": 0.0032309608357424142, "rougeLsum_recall": 0.3665706405925854, "rougeLsum_recall_stderr": 0.0027071065833861527}}, "4": {"generate_text_restaurant": {"bleu": 13.476662518674425, "bleu_stderr": 0.14804924963101374, "rouge1_fmeasure": 0.4751483029381261, "rouge1_fmeasure_stderr": 0.0022939871226639315, "rouge1_precision": 0.575148309756594, "rouge1_precision_stderr": 0.0032456743499240143, "rouge1_recall": 0.4379553251280355, "rouge1_recall_stderr": 0.0027875515895071923, "rouge2_fmeasure": 0.23937430569965518, "rouge2_fmeasure_stderr": 0.0021223157395609733, "rouge2_precision": 0.2932213385624628, "rouge2_precision_stderr": 0.0027850927611603377, "rouge2_recall": 0.220787156975807, "rouge2_recall_stderr": 0.0022308643708334754, "rougeL_fmeasure": 0.35385483221937813, "rougeL_fmeasure_stderr": 0.0021780339813240314, "rougeL_precision": 0.4297630033921168, "rougeL_precision_stderr": 0.0030250335936550877, "rougeL_recall": 0.3260216890650744, "rougeL_recall_stderr": 0.0024522487516979575, "rougeLsum_fmeasure": 0.3976384682503085, "rougeLsum_fmeasure_stderr": 0.0023729927223769785, "rougeLsum_precision": 0.4813876805218902, "rougeLsum_precision_stderr": 0.0032181540128894265, "rougeLsum_recall": 0.36675888786779215, "rougeLsum_recall_stderr": 0.002699461696539119}}, "5": {"generate_text_restaurant": {"bleu": 13.156897203455948, "bleu_stderr": 0.08630136341873869, "rouge1_fmeasure": 0.4729751649957838, "rouge1_fmeasure_stderr": 0.0022801925622949843, "rouge1_precision": 0.5738627336760422, "rouge1_precision_stderr": 0.0032894505336520264, "rouge1_recall": 0.4353618479598648, "rouge1_recall_stderr": 0.0027767731270488508, "rouge2_fmeasure": 0.23628094628078694, "rouge2_fmeasure_stderr": 0.00211214055033469, "rouge2_precision": 0.29086794940233046, "rouge2_precision_stderr": 0.002818639353471394, "rouge2_recall": 0.21741764823749096, "rouge2_recall_stderr": 0.002221377915423391, "rougeL_fmeasure": 0.3534318578391924, "rougeL_fmeasure_stderr": 0.0021797527155027353, "rougeL_precision": 0.4304412705229436, "rougeL_precision_stderr": 0.003073791049621951, "rougeL_recall": 0.32508975084283026, "rougeL_recall_stderr": 0.0024528402950547467, "rougeLsum_fmeasure": 0.39642244501779056, "rougeLsum_fmeasure_stderr": 0.002354679808047655, "rougeLsum_precision": 0.4815893698288744, "rougeLsum_precision_stderr": 0.0032662227256431515, "rougeLsum_recall": 0.36475434401700674, "rougeLsum_recall_stderr": 0.00266853275926591}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0285249742430023, "bleu_stderr": 0.04658561770880105, "rouge1_fmeasure": 0.1478842341216362, "rouge1_fmeasure_stderr": 0.0027472203051295764, "rouge1_precision": 0.10711262762227787, "rouge1_precision_stderr": 0.0020601915536585967, "rouge1_recall": 0.25337181483295146, "rouge1_recall_stderr": 0.0046856436696164905, "rouge2_fmeasure": 0.02780714664643612, "rouge2_fmeasure_stderr": 0.0012134748561574466, "rouge2_precision": 0.019781013833534398, "rouge2_precision_stderr": 0.0008722452654034713, "rouge2_recall": 0.04912277748197487, "rouge2_recall_stderr": 0.0021576559260432712, "rougeL_fmeasure": 0.11266163818049864, "rougeL_fmeasure_stderr": 0.0020299599299667603, "rougeL_precision": 0.08153762034631415, "rougeL_precision_stderr": 0.0015205040880351287, "rougeL_recall": 0.19377740656973594, "rougeL_recall_stderr": 0.0035212605780510554, "rougeLsum_fmeasure": 0.11968068511417429, "rougeLsum_fmeasure_stderr": 0.0021748845339722685, "rougeLsum_precision": 0.08657778420332544, "rougeLsum_precision_stderr": 0.0016263154802718866, "rougeLsum_recall": 0.20583552514369016, "rougeLsum_recall_stderr": 0.003771689144281502}}, "1": {"article_DOC_summary": {"bleu": 2.1792875082477123, "bleu_stderr": 0.06319420014636688, "rouge1_fmeasure": 0.2183934186796248, "rouge1_fmeasure_stderr": 0.0031131023395006203, "rouge1_precision": 0.2154872949099143, "rouge1_precision_stderr": 0.0037469904189591395, "rouge1_recall": 0.25867586586469277, "rouge1_recall_stderr": 0.004134512250176692, "rouge2_fmeasure": 0.04500583650450346, "rouge2_fmeasure_stderr": 0.001863892149786922, "rouge2_precision": 0.0447245429308555, "rouge2_precision_stderr": 0.0020714108653434506, "rouge2_recall": 0.05502827731187604, "rouge2_recall_stderr": 0.0023758194054523916, "rougeL_fmeasure": 0.1635144970491249, "rougeL_fmeasure_stderr": 0.0024820064917382306, "rougeL_precision": 0.1614628963956234, "rougeL_precision_stderr": 0.0030120325913654804, "rougeL_recall": 0.19472956411902964, "rougeL_recall_stderr": 0.0033235017592203077, "rougeLsum_fmeasure": 0.1671666603817299, "rougeLsum_fmeasure_stderr": 0.0025413985087851003, "rougeLsum_precision": 0.16439844895006708, "rougeLsum_precision_stderr": 0.003021699034691555, "rougeLsum_recall": 0.2002131667137321, "rougeLsum_recall_stderr": 0.003522860713518912}}, "2": {"article_DOC_summary": {"bleu": 2.9719119603418074, "bleu_stderr": 0.16832095088560176, "rouge1_fmeasure": 0.23579050025596546, "rouge1_fmeasure_stderr": 0.0032208877086972645, "rouge1_precision": 0.24578221078853943, "rouge1_precision_stderr": 0.0038311545433087246, "rouge1_recall": 0.25247232169607386, "rouge1_recall_stderr": 0.003851791759690215, "rouge2_fmeasure": 0.051882326943507785, "rouge2_fmeasure_stderr": 0.002120030804437139, "rouge2_precision": 0.05385976155670831, "rouge2_precision_stderr": 0.002293715583911734, "rouge2_recall": 0.05655894066313464, "rouge2_recall_stderr": 0.002406489537857907, "rougeL_fmeasure": 0.17839101172174274, "rougeL_fmeasure_stderr": 0.0026918945920387577, "rougeL_precision": 0.18601506569279253, "rougeL_precision_stderr": 0.0031620641585856746, "rougeL_recall": 0.19159840411716447, "rougeL_recall_stderr": 0.003222530909275831, "rougeLsum_fmeasure": 0.18050054360244336, "rougeLsum_fmeasure_stderr": 0.002709070917463213, "rougeLsum_precision": 0.18783807621807994, "rougeLsum_precision_stderr": 0.003156835477899629, "rougeLsum_recall": 0.19452321319767416, "rougeLsum_recall_stderr": 0.0033253733370942227}}, "3": {"article_DOC_summary": {"bleu": 3.2041870719788714, "bleu_stderr": 0.20611661341919532, "rouge1_fmeasure": 0.2277409702430376, "rouge1_fmeasure_stderr": 0.0036665816219810165, "rouge1_precision": 0.24477878269919434, "rouge1_precision_stderr": 0.0043304256076557015, "rouge1_recall": 0.23546035253406764, "rouge1_recall_stderr": 0.0040420090423460295, "rouge2_fmeasure": 0.05187808695380671, "rouge2_fmeasure_stderr": 0.0022736997047107283, "rouge2_precision": 0.05535540959023755, "rouge2_precision_stderr": 0.002520852290920342, "rouge2_recall": 0.05400488265902618, "rouge2_recall_stderr": 0.0023942103295597683, "rougeL_fmeasure": 0.1726765296689622, "rougeL_fmeasure_stderr": 0.0030037675906149446, "rougeL_precision": 0.18676731153569484, "rougeL_precision_stderr": 0.0035994106495257387, "rougeL_recall": 0.17809432971388495, "rougeL_recall_stderr": 0.003241778335625792, "rougeLsum_fmeasure": 0.1742948037879305, "rougeLsum_fmeasure_stderr": 0.0030227160469161974, "rougeLsum_precision": 0.1881533844870454, "rougeLsum_precision_stderr": 0.003601435982381053, "rougeLsum_recall": 0.18040945162861388, "rougeLsum_recall_stderr": 0.0033378628217040685}}, "4": {"article_DOC_summary": {"bleu": 0.114133025618453, "bleu_stderr": 0.03902166340342278, "rouge1_fmeasure": 0.05638135159346031, "rouge1_fmeasure_stderr": 0.0033862796262482922, "rouge1_precision": 0.06536803730199292, "rouge1_precision_stderr": 0.0040080446277792475, "rouge1_recall": 0.0562578647719299, "rouge1_recall_stderr": 0.003466710519052521, "rouge2_fmeasure": 0.01218356666574454, "rouge2_fmeasure_stderr": 0.0012325430366165824, "rouge2_precision": 0.013453387419618108, "rouge2_precision_stderr": 0.0014082914388032289, "rouge2_recall": 0.012484556808642255, "rouge2_recall_stderr": 0.0012859869239756115, "rougeL_fmeasure": 0.042197213021247744, "rougeL_fmeasure_stderr": 0.0025684294584781297, "rougeL_precision": 0.049695294130091966, "rougeL_precision_stderr": 0.0031067522139507655, "rougeL_recall": 0.04191724945748147, "rougeL_recall_stderr": 0.0026229390757436582, "rougeLsum_fmeasure": 0.042656324273577836, "rougeLsum_fmeasure_stderr": 0.002590374442922286, "rougeLsum_precision": 0.0501407378152416, "rougeLsum_precision_stderr": 0.00312422462775062, "rougeLsum_recall": 0.04244935232602974, "rougeLsum_recall_stderr": 0.0026544789695008624}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..f66d640fcd36cf72d21df2ec8c20e601391367f8 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.2562657244023514, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.028344931968085282 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.062384701456087446, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020959572066061704 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.2778238258177499, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0053586090146941585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.08917647320322222, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001969587935532424 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.026957871325524504, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00119791534895436 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.12120656565208492, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003245668709085193 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.03885973727074715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011411233576479184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.059948665447718395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001993918360545074 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2696777384075072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005223094399514082 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08588431201368854, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018539010477647254 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.05914169795430463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020593373696654797 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.25838316242026105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00491849430291812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.08392617035294098, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001881634203344931 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..f84750876ce8420c0a309fcc42d25404ed91b2a9 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.44906636509622533, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.039450588630595015 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.11288061517490074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00386994874457578 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.27968139569340955, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004818798730073509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13316786961188862, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0032035615287408364 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.052903771126418274, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0024560083140397187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.13670929059101208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032752111527625427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0635687916821055, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0020746952988998694 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.10294903334837356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0034785078085129637 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.26443193955147987, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004508721603501755 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.12246686808714284, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0027865982212275067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.1043770278231783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035484629358203635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2648398336113232, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004503000799632382 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12355055054182587, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002841014595626855 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5ae49c94ac1f7e59629792dc7b6c5544f286f5cf --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4687852352128821, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037809750600987985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.12939788800380767, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004056238000462917 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3205721940394599, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004784451352989951 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.15485025651871712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0034302178802564512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.06272684866048131, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002451752863709562 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16040449198937512, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003451151403396264 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.07545945954698206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0021937365311268525 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.11538953087200472, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00346589668028597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30175228398138576, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004432821529779081 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.14062442945039813, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0028845693981807787 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.11767324801231895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0035688465184641432 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3033452610052125, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004447592173686305 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.14250354572951518, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002959772587239683 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ee67a8331ae0579f811dcb9442bfee314595c6ca --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5387522714694768, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.023700066593108533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.14028377513050538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004341036446200867 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.33808480225227105, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004840897586860907 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1660401631118107, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037496181550606754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.07249658632299814, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0029199126351436076 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1738659001719329, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003647562909324833 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.08448456167004659, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.002619780755266116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12529450659786737, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003740919506421422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3182618579312013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004525730253384199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.15093871662748365, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0032225789949202046 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.12866365606890057, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003897701131855631 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3209266755398485, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004550196873941636 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.15379137375695537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0033296076273151513 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0dd2508bb2ef152493fbfe017d4df44bb33ddc27 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6154728892401495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05492052153457257 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.1416216780254008, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004403560474088744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.350369181874358, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004821260815960307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.16918194985742865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003726050287797417 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.07377427634131609, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0029769913238928973 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18193245830782417, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003657633632618109 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.08639654396535042, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0025466510150414677 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.12691040102952247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0038030608678551996 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33016396642156515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004483672398798779 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.15419230913013102, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0031966205747524113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.1301329876623986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003962229410505262 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3326898264482343, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004514627367420683 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.15686161901707832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003303387308503824 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c51e9124726e2fc64dafbbe40587856eabc17612 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6280080728696126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04207890973166607 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.15509132478375567, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004705843597864089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36072408074578627, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004932090632111422 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1785781279253249, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.003864561091208021 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.08180998073756966, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0031797028106942695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1895120629191651, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037580265385154963 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.09234282351104058, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0026995310439716595 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.13813720543380648, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004053616600136467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33922496409575775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004622505715509403 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1620344936907299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.003324512248892316 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.14168764741861828, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004239364453934911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.34078189394441194, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004625313010789047 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.16446961325308998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0034245269422962608 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..effb976d508e077c2223ef9eb32da13b24ab1dc2 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14831832571904127, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0020031574591708096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.23669379639579727, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002619263402445374 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.16843326604027012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018787988149176356 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.027495279137204204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008008675778483577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04558631579219217, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013034624565288761 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03124497623150672, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000826359540629872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11944058601706234, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014613554843883904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.197662087047554, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022098401272736727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1373698337101501, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014061340796650912 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.13662513471708013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001834224913870075 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21848967035350617, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00242500324850602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15530559329563356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017277092690194619 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.4416513225274035, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07150581722616807 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..a549d7daebca055f90284836950bad7b0ebb45ae --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2474989333475308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004192775409890903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.16589702996724112, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002477684211999076 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1595930665634592, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020260063889951897 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06456439217534583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0026036519415425876 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.03395563608028862, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011867665512589468 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03442090774378983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011197085684525982 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.2007100384592662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0036402244436710527 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.12968060330920403, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0019100623389590725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12543184498335516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015786460435038327 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2356687196157363, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004040097148447038 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.15692818247747958, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023149233432171795 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15113313197846198, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019006022698345565 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.9112541973999004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08163120654641273 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..cbc23f38f5e0c730066097a7d95ab09ab1449566 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.3671552578777415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00464310641179873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.18316424412100443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002534818661881543 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20013093911819188, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021449192871676984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.11652986149569292, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0033165278944766326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.048557963351484004, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012897480876968325 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05444986552431719, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012835369876590637 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.3030100733810117, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004160446544180668 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1454519391726235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020102539956042787 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1600577177279773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017194821563251488 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.34948494261958846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00452044298551405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.17241597783925564, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0023681037127358745 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18899973850835908, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020281161914398288 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.4920293719291493, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06820740373159184 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1742a9bda1ed3552655d6dc7dcdb569379b7c44d --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.31853023306817924, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.005101746577027814 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.14722792301810253, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026467115286204013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.16433308908585867, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002402134739956357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.10260882729326853, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0033488070271773295 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04037648707007591, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012615029724490608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04583954485462417, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012897030660356447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.26690611784240603, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004530048771512807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.11874183155575303, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021251987841602225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13364146877613045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00195553742839865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.3046492013868888, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004949401564936648 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.1393491071946238, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024976587731928646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15587618164528091, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002273493914941302 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.6055821821151304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06565825928935355 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4ce0f01a20660cae2a50a11cc15ca1c1d1f226 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.10914047578759525, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.004168163234199949 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.04875414041264775, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0020018963702338544 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05455257649733116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002012404693346845 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.037261877113724175, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.002437336913811512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.013448412424643126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0008658300088105147 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.01577378906241384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009385589100462621 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.09325777757588789, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003680574837942309 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.03991105511182912, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0016279952050045603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.045100032510421204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016666399875103265 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.10390247551362326, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004006559051889402 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.04597031630948619, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0018861168846922784 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.051422475813845084, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018931617381586926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.006932560692974423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0017990239865701577 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..61b0a76c8ae322ff5b169ebc9cc3aae433ccf084 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.019024182319545703, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019567530452185085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.00809080049652358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0008930277662354756 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009250948371284507, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0009365636508956353 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.007878162042298584, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012119853940782482 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0024981499124714754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00036249889299460995 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0031062476201456015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00043823448085888057 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.016781942931569222, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017964368358031513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.006703878314586593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0007330971808896851 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.007784924334651645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007922903839285351 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.01855296243342967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001923182050974051 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.00783323994370169, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0008689584836341853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008943947340808445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0009064353361518755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.5486400435190646e-20, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 5.928974165985541e-18 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9b3a5dbe8c1b0ad4f5eb80b417f31411772f0cc7 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.06536803730199292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0040080446277792475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0562578647719299, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.003466710519052521 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05638135159346031, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0033862796262482922 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.013453387419618108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0014082914388032289 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.012484556808642255, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0012859869239756115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01218356666574454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0012325430366165824 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.049695294130091966, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0031067522139507655 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.04191724945748147, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0026229390757436582 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.042197213021247744, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0025684294584781297 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0501407378152416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00312422462775062 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.04244935232602974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0026544789695008624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.042656324273577836, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002590374442922286 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.114133025618453, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.03902166340342278 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..69629d02d9fb3e58d19a500f27832b84be9a10f0 --- /dev/null +++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv new file mode 100644 index 0000000000000000000000000000000000000000..9a783151c99e2fb57fd6bed4ac0a4a64dd09a014 --- /dev/null +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.322,0.014782913600996673,0 +anli_r2,acc,0.331,0.014888272588203936,0 +anli_r3,acc,0.3358333333333333,0.013639261190932887,0 +arc_challenge,acc,0.2380546075085324,0.012445770028026208,0 +arc_challenge,acc_norm,0.2627986348122867,0.012862523175351335,0 +arc_easy,acc,0.5513468013468014,0.010205540414612862,0 +arc_easy,acc_norm,0.49326599326599324,0.010258852980991825,0 +boolq,acc,0.5850152905198777,0.008617716361921567,1 +cb,acc,0.35714285714285715,0.0646095738380922,1 +cb,f1,0.24888576120103215,,1 +copa,acc,0.68,0.046882617226215034,0 +hellaswag,acc,0.38169687313284206,0.004848099661619686,0 +hellaswag,acc_norm,0.47689703246365267,0.004984452002563925,0 +piqa,acc,0.7121871599564744,0.01056325038305919,0 +piqa,acc_norm,0.7094668117519043,0.010592765034696534,0 +rte,acc,0.5270758122743683,0.030052303463143706,0 +sciq,acc,0.804,0.012559527926707378,0 +sciq,acc_norm,0.722,0.014174516461485247,0 +storycloze_2016,acc,0.6654195617316943,0.01091131896712794,0 +winogrande,acc,0.5138121546961326,0.014047122916440415,0 diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json index d743475ed42e665cc658484b0fffc490d74d9170..8a11443b4b309ef0fcdb477171d9a075d2f44646 100644 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.68, "acc_stderr": 0.046882617226215034 + }, + "hellaswag": { + "acc": 0.38169687313284206, + "acc_stderr": 0.004848099661619686, + "acc_norm": 0.47689703246365267, + "acc_norm_stderr": 0.004984452002563925 + }, + "rte": { + "acc": 0.5270758122743683, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.5138121546961326, + "acc_stderr": 0.014047122916440415 + }, + "storycloze_2016": { + "acc": 0.6654195617316943, + "acc_stderr": 0.01091131896712794 + }, + "boolq": { + "acc": 0.5850152905198777, + "acc_stderr": 0.008617716361921567 + }, + "arc_easy": { + "acc": 0.5513468013468014, + "acc_stderr": 0.010205540414612862, + "acc_norm": 0.49326599326599324, + "acc_norm_stderr": 0.010258852980991825 + }, + "arc_challenge": { + "acc": 0.2380546075085324, + "acc_stderr": 0.012445770028026208, + "acc_norm": 0.2627986348122867, + "acc_norm_stderr": 0.012862523175351335 + }, + "sciq": { + "acc": 0.804, + "acc_stderr": 0.012559527926707378, + "acc_norm": 0.722, + "acc_norm_stderr": 0.014174516461485247 + }, + "piqa": { + "acc": 0.7121871599564744, + "acc_stderr": 0.01056325038305919, + "acc_norm": 0.7094668117519043, + "acc_norm_stderr": 0.010592765034696534 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json deleted file mode 100644 index d743475ed42e665cc658484b0fffc490d74d9170..0000000000000000000000000000000000000000 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.322, - "acc_stderr": 0.014782913600996673 - }, - "anli_r2": { - "acc": 0.331, - "acc_stderr": 0.014888272588203936 - }, - "anli_r3": { - "acc": 0.3358333333333333, - "acc_stderr": 0.013639261190932887 - }, - "cb": { - "acc": 0.35714285714285715, - "acc_stderr": 0.0646095738380922, - "f1": 0.24888576120103215 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.046882617226215034 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv new file mode 100644 index 0000000000000000000000000000000000000000..ba5877a90a5786fc5a06117d5d75a35c4e2bf3c3 --- /dev/null +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.303,0.014539683710535255,0 +anli_r2,acc,0.319,0.014746404865473486,0 +anli_r3,acc,0.33416666666666667,0.013622434813136781,0 +arc_challenge,acc,0.2431740614334471,0.012536554144587092,0 +arc_challenge,acc_norm,0.28071672354948807,0.013131238126975583,0 +arc_easy,acc,0.5631313131313131,0.010177672928157681,0 +arc_easy,acc_norm,0.5273569023569024,0.010244415164390529,0 +boolq,acc,0.5813455657492355,0.008628545022868554,1 +cb,acc,0.48214285714285715,0.06737697508644648,1 +cb,f1,0.3270348837209302,,1 +copa,acc,0.66,0.04760952285695237,0 +hellaswag,acc,0.3815972913762199,0.004847857546957478,0 +hellaswag,acc_norm,0.477096195976897,0.004984543540932335,0 +piqa,acc,0.7078346028291621,0.010610252174513658,0 +piqa,acc_norm,0.6996735582154516,0.010695225308183145,0 +rte,acc,0.5306859205776173,0.03003973059219781,0 +sciq,acc,0.871,0.010605256784796565,0 +sciq,acc_norm,0.861,0.010945263761042963,0 +storycloze_2016,acc,0.655264564404062,0.01099083028205749,0 +winogrande,acc,0.5414364640883977,0.0140041468537919,0 diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json index 1d45ddcd822d1d31b6bd83c272c1975e920a8c59..d0f4fde80498d2b81785a034bd67447b961ac856 100644 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.66, "acc_stderr": 0.04760952285695237 + }, + "hellaswag": { + "acc": 0.3815972913762199, + "acc_stderr": 0.004847857546957478, + "acc_norm": 0.477096195976897, + "acc_norm_stderr": 0.004984543540932335 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.03003973059219781 + }, + "winogrande": { + "acc": 0.5414364640883977, + "acc_stderr": 0.0140041468537919 + }, + "storycloze_2016": { + "acc": 0.655264564404062, + "acc_stderr": 0.01099083028205749 + }, + "boolq": { + "acc": 0.5813455657492355, + "acc_stderr": 0.008628545022868554 + }, + "arc_easy": { + "acc": 0.5631313131313131, + "acc_stderr": 0.010177672928157681, + "acc_norm": 0.5273569023569024, + "acc_norm_stderr": 0.010244415164390529 + }, + "arc_challenge": { + "acc": 0.2431740614334471, + "acc_stderr": 0.012536554144587092, + "acc_norm": 0.28071672354948807, + "acc_norm_stderr": 0.013131238126975583 + }, + "sciq": { + "acc": 0.871, + "acc_stderr": 0.010605256784796565, + "acc_norm": 0.861, + "acc_norm_stderr": 0.010945263761042963 + }, + "piqa": { + "acc": 0.7078346028291621, + "acc_stderr": 0.010610252174513658, + "acc_norm": 0.6996735582154516, + "acc_norm_stderr": 0.010695225308183145 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json deleted file mode 100644 index 1d45ddcd822d1d31b6bd83c272c1975e920a8c59..0000000000000000000000000000000000000000 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.303, - "acc_stderr": 0.014539683710535255 - }, - "anli_r2": { - "acc": 0.319, - "acc_stderr": 0.014746404865473486 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.013622434813136781 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.06737697508644648, - "f1": 0.3270348837209302 - }, - "copa": { - "acc": 0.66, - "acc_stderr": 0.04760952285695237 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv new file mode 100644 index 0000000000000000000000000000000000000000..9ae892f5463c3edbcd6e22e2840c92a7a4f03547 --- /dev/null +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.311,0.014645596385722694,0 +anli_r2,acc,0.346,0.01505026612756444,0 +anli_r3,acc,0.325,0.013526454480351014,0 +arc_challenge,acc,0.24573378839590443,0.012581033453730114,0 +arc_challenge,acc_norm,0.27986348122866894,0.013119040897725922,0 +arc_easy,acc,0.5744949494949495,0.010145271182591021,0 +arc_easy,acc_norm,0.5471380471380471,0.010214087372211392,0 +boolq,acc,0.5629969418960244,0.008675365793227084,1 +cb,acc,0.4107142857142857,0.0663363415035954,1 +cb,f1,0.26927814732692784,,1 +copa,acc,0.68,0.04688261722621505,0 +hellaswag,acc,0.38149770961959767,0.00484761521647344,0 +hellaswag,acc_norm,0.4757020513841864,0.004983886091690525,0 +piqa,acc,0.7154515778019587,0.010527218464130605,0 +piqa,acc_norm,0.7105549510337323,0.01058101474067561,0 +rte,acc,0.5379061371841155,0.03000984891252912,0 +sciq,acc,0.885,0.010093407594904628,0 +sciq,acc_norm,0.88,0.010281328012747384,0 +storycloze_2016,acc,0.6547300908605024,0.010994860223187675,0 +winogrande,acc,0.5272296764009471,0.014031631629827696,0 diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json index d081bc3b334843d38818453d028bc3987662ad70..c4d124763162d14a87ad9d0e0d4c23ca1cd2efc9 100644 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.68, "acc_stderr": 0.04688261722621505 + }, + "hellaswag": { + "acc": 0.38149770961959767, + "acc_stderr": 0.00484761521647344, + "acc_norm": 0.4757020513841864, + "acc_norm_stderr": 0.004983886091690525 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.03000984891252912 + }, + "winogrande": { + "acc": 0.5272296764009471, + "acc_stderr": 0.014031631629827696 + }, + "storycloze_2016": { + "acc": 0.6547300908605024, + "acc_stderr": 0.010994860223187675 + }, + "boolq": { + "acc": 0.5629969418960244, + "acc_stderr": 0.008675365793227084 + }, + "arc_easy": { + "acc": 0.5744949494949495, + "acc_stderr": 0.010145271182591021, + "acc_norm": 0.5471380471380471, + "acc_norm_stderr": 0.010214087372211392 + }, + "arc_challenge": { + "acc": 0.24573378839590443, + "acc_stderr": 0.012581033453730114, + "acc_norm": 0.27986348122866894, + "acc_norm_stderr": 0.013119040897725922 + }, + "sciq": { + "acc": 0.885, + "acc_stderr": 0.010093407594904628, + "acc_norm": 0.88, + "acc_norm_stderr": 0.010281328012747384 + }, + "piqa": { + "acc": 0.7154515778019587, + "acc_stderr": 0.010527218464130605, + "acc_norm": 0.7105549510337323, + "acc_norm_stderr": 0.01058101474067561 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2_lm-eval_global_step52452_2023-02-25-11-18-29_2shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2_lm-eval_global_step52452_2023-02-25-11-18-29_2shots_backup.json deleted file mode 100644 index d081bc3b334843d38818453d028bc3987662ad70..0000000000000000000000000000000000000000 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2_lm-eval_global_step52452_2023-02-25-11-18-29_2shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.311, - "acc_stderr": 0.014645596385722694 - }, - "anli_r2": { - "acc": 0.346, - "acc_stderr": 0.01505026612756444 - }, - "anli_r3": { - "acc": 0.325, - "acc_stderr": 0.013526454480351014 - }, - "cb": { - "acc": 0.4107142857142857, - "acc_stderr": 0.0663363415035954, - "f1": 0.26927814732692784 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621505 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.csv new file mode 100644 index 0000000000000000000000000000000000000000..1e192117c28be80ce1cb36706c50622a3ce92381 --- /dev/null +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.316,0.014709193056057134,0 +anli_r2,acc,0.344,0.015029633724408945,0 +anli_r3,acc,0.335,0.013630871843821482,0 +arc_challenge,acc,0.23720136518771331,0.012430399829260844,0 +arc_challenge,acc_norm,0.2883959044368601,0.01323839442242817,0 +arc_easy,acc,0.5698653198653199,0.010159130445178502,0 +arc_easy,acc_norm,0.5517676767676768,0.010204645126856942,0 +boolq,acc,0.5660550458715596,0.008668405003744127,1 +cb,acc,0.48214285714285715,0.0673769750864465,1 +cb,f1,0.32495309568480296,,1 +copa,acc,0.68,0.04688261722621504,0 +hellaswag,acc,0.38179645488946423,0.004848341560492138,0 +hellaswag,acc_norm,0.4785899223262298,0.004985204766555062,0 +piqa,acc,0.7187159956474428,0.010490509832327423,0 +piqa,acc_norm,0.7127312295973884,0.010557291761528637,0 +rte,acc,0.5054151624548736,0.030094698123239966,0 +sciq,acc,0.882,0.010206869264381791,0 +sciq,acc_norm,0.879,0.010318210380946097,0 +storycloze_2016,acc,0.6541956173169428,0.010998874799044323,0 +winogrande,acc,0.5288082083662194,0.014029141615909617,0 diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json index 97521dea31f26287cb5e405d1e108098872f9deb..bb08b35b8b95430d58ebd882b84d5709f210d88d 100644 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.68, "acc_stderr": 0.04688261722621504 + }, + "hellaswag": { + "acc": 0.38179645488946423, + "acc_stderr": 0.004848341560492138, + "acc_norm": 0.4785899223262298, + "acc_norm_stderr": 0.004985204766555062 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5288082083662194, + "acc_stderr": 0.014029141615909617 + }, + "storycloze_2016": { + "acc": 0.6541956173169428, + "acc_stderr": 0.010998874799044323 + }, + "boolq": { + "acc": 0.5660550458715596, + "acc_stderr": 0.008668405003744127 + }, + "arc_easy": { + "acc": 0.5698653198653199, + "acc_stderr": 0.010159130445178502, + "acc_norm": 0.5517676767676768, + "acc_norm_stderr": 0.010204645126856942 + }, + "arc_challenge": { + "acc": 0.23720136518771331, + "acc_stderr": 0.012430399829260844, + "acc_norm": 0.2883959044368601, + "acc_norm_stderr": 0.01323839442242817 + }, + "sciq": { + "acc": 0.882, + "acc_stderr": 0.010206869264381791, + "acc_norm": 0.879, + "acc_norm_stderr": 0.010318210380946097 + }, + "piqa": { + "acc": 0.7187159956474428, + "acc_stderr": 0.010490509832327423, + "acc_norm": 0.7127312295973884, + "acc_norm_stderr": 0.010557291761528637 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3_lm-eval_global_step52452_2023-02-25-11-18-29_3shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3_lm-eval_global_step52452_2023-02-25-11-18-29_3shots_backup.json deleted file mode 100644 index 97521dea31f26287cb5e405d1e108098872f9deb..0000000000000000000000000000000000000000 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3_lm-eval_global_step52452_2023-02-25-11-18-29_3shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.316, - "acc_stderr": 0.014709193056057134 - }, - "anli_r2": { - "acc": 0.344, - "acc_stderr": 0.015029633724408945 - }, - "anli_r3": { - "acc": 0.335, - "acc_stderr": 0.013630871843821482 - }, - "cb": { - "acc": 0.48214285714285715, - "acc_stderr": 0.0673769750864465, - "f1": 0.32495309568480296 - }, - "copa": { - "acc": 0.68, - "acc_stderr": 0.04688261722621504 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.csv new file mode 100644 index 0000000000000000000000000000000000000000..7a2243ef0b1887e20cfd4e3ddea65497a2fe5130 --- /dev/null +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.34,0.014987482264363935,0 +anli_r2,acc,0.339,0.01497675877162034,0 +anli_r3,acc,0.33416666666666667,0.01362243481313678,0 +arc_challenge,acc,0.24061433447098976,0.012491468532390573,0 +arc_challenge,acc_norm,0.27303754266211605,0.013019332762635743,0 +arc_easy,acc,0.5732323232323232,0.010149141043955636,0 +arc_easy,acc_norm,0.5589225589225589,0.010188293221040564,0 +boolq,acc,0.5519877675840978,0.008697655510897233,1 +cb,acc,0.4642857142857143,0.06724777654937658,1 +cb,f1,0.3085858585858586,,1 +copa,acc,0.73,0.0446196043338474,0 +hellaswag,acc,0.38149770961959767,0.0048476152164734386,0 +hellaswag,acc_norm,0.47938657637920734,0.004985539159783411,0 +piqa,acc,0.7067464635473341,0.010621818421101924,0 +piqa,acc_norm,0.704570184983678,0.010644731559342467,0 +rte,acc,0.5090252707581228,0.030091559826331334,0 +sciq,acc,0.899,0.00953361892934102,0 +sciq,acc_norm,0.902,0.00940661918462123,0 +storycloze_2016,acc,0.6536611437733832,0.01100287402644642,0 +winogrande,acc,0.5295974743488555,0.014027843827840086,0 diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json index 54d014eb10da0c80fac0ac75c3fc41fcef7b7980..58b11ab206673db71a0b7816c7254f8c164c01ba 100644 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json @@ -20,6 +20,52 @@ "copa": { "acc": 0.73, "acc_stderr": 0.0446196043338474 + }, + "hellaswag": { + "acc": 0.38149770961959767, + "acc_stderr": 0.0048476152164734386, + "acc_norm": 0.47938657637920734, + "acc_norm_stderr": 0.004985539159783411 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5295974743488555, + "acc_stderr": 0.014027843827840086 + }, + "storycloze_2016": { + "acc": 0.6536611437733832, + "acc_stderr": 0.01100287402644642 + }, + "boolq": { + "acc": 0.5519877675840978, + "acc_stderr": 0.008697655510897233 + }, + "arc_easy": { + "acc": 0.5732323232323232, + "acc_stderr": 0.010149141043955636, + "acc_norm": 0.5589225589225589, + "acc_norm_stderr": 0.010188293221040564 + }, + "arc_challenge": { + "acc": 0.24061433447098976, + "acc_stderr": 0.012491468532390573, + "acc_norm": 0.27303754266211605, + "acc_norm_stderr": 0.013019332762635743 + }, + "sciq": { + "acc": 0.899, + "acc_stderr": 0.00953361892934102, + "acc_norm": 0.902, + "acc_norm_stderr": 0.00940661918462123 + }, + "piqa": { + "acc": 0.7067464635473341, + "acc_stderr": 0.010621818421101924, + "acc_norm": 0.704570184983678, + "acc_norm_stderr": 0.010644731559342467 } }, "versions": { @@ -27,6 +73,15 @@ "anli_r2": 0, "anli_r3": 0, "cb": 1, - "copa": 0 + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4_lm-eval_global_step52452_2023-02-25-11-16-27_4shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4_lm-eval_global_step52452_2023-02-25-11-16-27_4shots_backup.json deleted file mode 100644 index 54d014eb10da0c80fac0ac75c3fc41fcef7b7980..0000000000000000000000000000000000000000 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4_lm-eval_global_step52452_2023-02-25-11-16-27_4shots_backup.json +++ /dev/null @@ -1,32 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.34, - "acc_stderr": 0.014987482264363935 - }, - "anli_r2": { - "acc": 0.339, - "acc_stderr": 0.01497675877162034 - }, - "anli_r3": { - "acc": 0.33416666666666667, - "acc_stderr": 0.01362243481313678 - }, - "cb": { - "acc": 0.4642857142857143, - "acc_stderr": 0.06724777654937658, - "f1": 0.3085858585858586 - }, - "copa": { - "acc": 0.73, - "acc_stderr": 0.0446196043338474 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0, - "anli_r3": 0, - "cb": 1, - "copa": 0 - } -} \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.csv new file mode 100644 index 0000000000000000000000000000000000000000..df7e4a65b658b9bfc344204a72368adee3feadca --- /dev/null +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.csv @@ -0,0 +1,21 @@ +task,metric,value,err,version +anli_r1,acc,0.339,0.014976758771620342,0 +anli_r2,acc,0.347,0.015060472031706615,0 +anli_r3,acc,0.3425,0.013704669762934722,0 +arc_challenge,acc,0.2551194539249147,0.012739038695202102,0 +arc_challenge,acc_norm,0.26621160409556316,0.012915774781523214,0 +arc_easy,acc,0.5711279461279462,0.010155440652900154,0 +arc_easy,acc_norm,0.5467171717171717,0.010214901516731609,0 +boolq,acc,0.5547400611620795,0.008692488322023063,1 +cb,acc,0.375,0.06527912098338669,1 +cb,f1,0.2429169746242917,,1 +copa,acc,0.7,0.046056618647183814,0 +hellaswag,acc,0.3819956184027086,0.004848824710995933,0 +hellaswag,acc_norm,0.483469428400717,0.0049870536525402735,0 +piqa,acc,0.6953210010881393,0.010738889044325161,0 +piqa,acc_norm,0.7029379760609358,0.010661725404814783,0 +rte,acc,0.5631768953068592,0.02985524739031494,0 +sciq,acc,0.899,0.009533618929340995,0 +sciq,acc_norm,0.903,0.009363689373248121,0 +storycloze_2016,acc,0.6606092998396579,0.010949682016358629,0 +winogrande,acc,0.526440410418311,0.014032823874407225,0 diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json index 01defb03f69355c1ce57dae755114fd90728e8ab..2377bf3a2d94368d8b1042c07d3a55a146f0809d 100644 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json +++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json @@ -7,10 +7,81 @@ "anli_r2": { "acc": 0.347, "acc_stderr": 0.015060472031706615 + }, + "anli_r3": { + "acc": 0.3425, + "acc_stderr": 0.013704669762934722 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.2429169746242917 + }, + "copa": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "hellaswag": { + "acc": 0.3819956184027086, + "acc_stderr": 0.004848824710995933, + "acc_norm": 0.483469428400717, + "acc_norm_stderr": 0.0049870536525402735 + }, + "rte": { + "acc": 0.5631768953068592, + "acc_stderr": 0.02985524739031494 + }, + "winogrande": { + "acc": 0.526440410418311, + "acc_stderr": 0.014032823874407225 + }, + "storycloze_2016": { + "acc": 0.6606092998396579, + "acc_stderr": 0.010949682016358629 + }, + "boolq": { + "acc": 0.5547400611620795, + "acc_stderr": 0.008692488322023063 + }, + "arc_easy": { + "acc": 0.5711279461279462, + "acc_stderr": 0.010155440652900154, + "acc_norm": 0.5467171717171717, + "acc_norm_stderr": 0.010214901516731609 + }, + "arc_challenge": { + "acc": 0.2551194539249147, + "acc_stderr": 0.012739038695202102, + "acc_norm": 0.26621160409556316, + "acc_norm_stderr": 0.012915774781523214 + }, + "sciq": { + "acc": 0.899, + "acc_stderr": 0.009533618929340995, + "acc_norm": 0.903, + "acc_norm_stderr": 0.009363689373248121 + }, + "piqa": { + "acc": 0.6953210010881393, + "acc_stderr": 0.010738889044325161, + "acc_norm": 0.7029379760609358, + "acc_norm_stderr": 0.010661725404814783 } }, "versions": { "anli_r1": 0, - "anli_r2": 0 + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 } } \ No newline at end of file diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5_lm-eval_global_step52452_2023-02-25-11-16-27_5shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5_lm-eval_global_step52452_2023-02-25-11-16-27_5shots_backup.json deleted file mode 100644 index 01defb03f69355c1ce57dae755114fd90728e8ab..0000000000000000000000000000000000000000 --- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5_lm-eval_global_step52452_2023-02-25-11-16-27_5shots_backup.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "results": { - "anli_r1": { - "acc": 0.339, - "acc_stderr": 0.014976758771620342 - }, - "anli_r2": { - "acc": 0.347, - "acc_stderr": 0.015060472031706615 - } - }, - "versions": { - "anli_r1": 0, - "anli_r2": 0 - } -} \ No newline at end of file