diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..17f13a1ed44946e2a4b73d3349c49e0c066629e2
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.2562657244023514, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.028344931968085282}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.062384701456087446, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020959572066061704}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.2778238258177499, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0053586090146941585}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.08917647320322222, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.001969587935532424}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.026957871325524504, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00119791534895436}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.12120656565208492, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003245668709085193}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.03885973727074715, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011411233576479184}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.059948665447718395, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001993918360545074}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.2696777384075072, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.005223094399514082}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.08588431201368854, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0018539010477647254}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.05914169795430463, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0020593373696654797}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.25838316242026105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00491849430291812}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.08392617035294098, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.001881634203344931}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c4e1615865df649dad080268b80baa90753835a
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.44906636509622533, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.039450588630595015}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.11288061517490074, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00386994874457578}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.27968139569340955, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004818798730073509}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.13316786961188862, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0032035615287408364}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.052903771126418274, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0024560083140397187}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.13670929059101208, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0032752111527625427}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.0635687916821055, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020746952988998694}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.10294903334837356, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0034785078085129637}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.26443193955147987, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004508721603501755}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.12246686808714284, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0027865982212275067}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1043770278231783, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035484629358203635}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.2648398336113232, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004503000799632382}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.12355055054182587, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002841014595626855}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..bc8a4d7237e68e1304a4fe013f4f8e7711d26c00
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.4687852352128821, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.037809750600987985}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.12939788800380767, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004056238000462917}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.3205721940394599, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004784451352989951}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.15485025651871712, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0034302178802564512}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.06272684866048131, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002451752863709562}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.16040449198937512, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003451151403396264}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.07545945954698206, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0021937365311268525}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.11538953087200472, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.00346589668028597}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.30175228398138576, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004432821529779081}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.14062442945039813, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0028845693981807787}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.11767324801231895, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0035688465184641432}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3033452610052125, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004447592173686305}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.14250354572951518, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002959772587239683}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..a230bfd7d9bec485da9e0a2a7d0ad2b6d6b9586a
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.5387522714694768, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.023700066593108533}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.14028377513050538, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004341036446200867}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.33808480225227105, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004840897586860907}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1660401631118107, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037496181550606754}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07249658632299814, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029199126351436076}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1738659001719329, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003647562909324833}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08448456167004659, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.002619780755266116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12529450659786737, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003740919506421422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3182618579312013, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004525730253384199}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15093871662748365, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0032225789949202046}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.12866365606890057, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003897701131855631}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3209266755398485, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004550196873941636}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15379137375695537, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0033296076273151513}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..90865ba04e38183e8de5ce969fb3a9308f936f0c
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6154728892401495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05492052153457257}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.1416216780254008, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004403560474088744}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.350369181874358, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004821260815960307}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.16918194985742865, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003726050287797417}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.07377427634131609, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0029769913238928973}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.18193245830782417, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003657633632618109}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.08639654396535042, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0025466510150414677}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.12691040102952247, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0038030608678551996}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33016396642156515, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004483672398798779}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.15419230913013102, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0031966205747524113}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.1301329876623986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003962229410505262}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.3326898264482343, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004514627367420683}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.15686161901707832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003303387308503824}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..8cf2365c87d86707dfad7e575b4cb74ca8c336b2
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.6280080728696126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.04207890973166607}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.15509132478375567, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004705843597864089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.36072408074578627, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004932090632111422}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.1785781279253249, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003864561091208021}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.08180998073756966, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0031797028106942695}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.1895120629191651, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0037580265385154963}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.09234282351104058, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0026995310439716595}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.13813720543380648, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004053616600136467}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.33922496409575775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004622505715509403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.1620344936907299, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.003324512248892316}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.14168764741861828, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004239364453934911}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.34078189394441194, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004625313010789047}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.16446961325308998, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n  ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0034245269422962608}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..8a134d07c0309a33d6725b18834226837cf3dc3a
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.14831832571904127, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0020031574591708096}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.23669379639579727, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002619263402445374}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16843326604027012, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0018787988149176356}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.027495279137204204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0008008675778483577}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04558631579219217, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0013034624565288761}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03124497623150672, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.000826359540629872}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.11944058601706234, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014613554843883904}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.197662087047554, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0022098401272736727}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1373698337101501, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014061340796650912}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.13662513471708013, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001834224913870075}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.21848967035350617, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00242500324850602}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15530559329563356, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017277092690194619}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.4416513225274035, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.07150581722616807}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 0, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..475998c237a807d9d5d36709a3cfcba6a0fdd3de
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.2474989333475308, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004192775409890903}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.16589702996724112, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002477684211999076}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.1595930665634592, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020260063889951897}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.06456439217534583, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0026036519415425876}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.03395563608028862, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0011867665512589468}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.03442090774378983, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0011197085684525982}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.2007100384592662, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0036402244436710527}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.12968060330920403, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0019100623389590725}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.12543184498335516, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0015786460435038327}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.2356687196157363, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004040097148447038}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.15692818247747958, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023149233432171795}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15113313197846198, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0019006022698345565}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.9112541973999004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08163120654641273}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 1, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f5e4839e410ad3a966e6c304c46ec5e3744ddd0
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.3671552578777415, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00464310641179873}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.18316424412100443, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002534818661881543}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.20013093911819188, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0021449192871676984}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.11652986149569292, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033165278944766326}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.048557963351484004, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012897480876968325}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.05444986552431719, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012835369876590637}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.3030100733810117, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004160446544180668}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.1454519391726235, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0020102539956042787}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.1600577177279773, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017194821563251488}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.34948494261958846, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00452044298551405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.17241597783925564, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0023681037127358745}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.18899973850835908, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020281161914398288}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 2.4920293719291493, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06820740373159184}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 2, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..d786a5cc8dd6655cd3010e63710e5b0eec5d4a51
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.31853023306817924, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.005101746577027814}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.14722792301810253, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026467115286204013}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.16433308908585867, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002402134739956357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.10260882729326853, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0033488070271773295}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.04037648707007591, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012615029724490608}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.04583954485462417, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012897030660356447}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.26690611784240603, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004530048771512807}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.11874183155575303, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0021251987841602225}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.13364146877613045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00195553742839865}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.3046492013868888, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004949401564936648}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.1393491071946238, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0024976587731928646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.15587618164528091, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002273493914941302}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.6055821821151304, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.06565825928935355}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 3, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e0d4e4ca13ac6541ca2fdf4ba7efd9a06a8d19a
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.10914047578759525, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.004168163234199949}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.04875414041264775, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0020018963702338544}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.05455257649733116, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.002012404693346845}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.037261877113724175, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.002437336913811512}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.013448412424643126, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0008658300088105147}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.01577378906241384, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0009385589100462621}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.09325777757588789, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.003680574837942309}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.03991105511182912, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0016279952050045603}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.045100032510421204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0016666399875103265}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.10390247551362326, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004006559051889402}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.04597031630948619, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0018861168846922784}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.051422475813845084, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018931617381586926}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 0.006932560692974423, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0017990239865701577}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..6b0cd28d51bcb0fa91851c98443c37dc91002ebd
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.019024182319545703, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019567530452185085}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.00809080049652358, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008930277662354756}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009250948371284507, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0009365636508956353}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.007878162042298584, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0012119853940782482}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0024981499124714754, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00036249889299460995}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.0031062476201456015, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00043823448085888057}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.016781942931569222, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017964368358031513}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.006703878314586593, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0007330971808896851}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.007784924334651645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0007922903839285351}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.01855296243342967, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001923182050974051}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.00783323994370169, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0008689584836341853}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008943947340808445, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0009064353361518755}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.5486400435190646e-20, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 5.928974165985541e-18}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..db07e847193c6ecbe5f1d364a8cc85498c3edaf4
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.06536803730199292, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0040080446277792475}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0562578647719299, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.003466710519052521}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.05638135159346031, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0033862796262482922}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.013453387419618108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0014082914388032289}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.012484556808642255, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0012859869239756115}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.01218356666574454, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0012325430366165824}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.049695294130091966, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0031067522139507655}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.04191724945748147, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0026229390757436582}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.042197213021247744, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0025684294584781297}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0501407378152416, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.00312422462775062}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.04244935232602974, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0026544789695008624}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.042656324273577836, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.002590374442922286}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.114133025618453, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03902166340342278}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 4, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..c793b055cfb7f8b2b49ff5b83c3b83a89871d074
--- /dev/null
+++ b/evaluation/generation/agg.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json
@@ -0,0 +1 @@
+{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}}
\ No newline at end of file
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..9d45c2505bca1318cda5c7928e1902f1eac12e1d 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:da240232ef9803594f89287674f0f363b3ee085ab5ada3f7ba1fcf42c74238a9
+size 4174292
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..66a02feda232d18a6b6dce7f60a434a9c5f67645 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1792872b8360c67aa01618d992d596079f04524a1f1503f34e0c5a8ea05d498
+size 4762910
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..557b9aea14a3f72b806733a789b376e00039e7c7 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b9f81e6fa3c44ef15ccfd5e9a1b3a9c9e0ca5f9d010e2e5a1f113c7e000edae
+size 5711084
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..29e11ea80075b543fbfa3eadec1fe22923104d3b 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6815796d6e2d668597f9433d5131bd8c3111fe44288ea40c86c6e7ce22c2c5a
+size 6596539
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..88686f9ab4b7d88483b3d0b86632ddaedaf0ca91 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0bfb2896e5849621f3f7a89e6828f03535322465ea21e99c1dc62eee20af2496
+size 7512274
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c6307ae5f63066e55ad7c123b9d415224301fdf8 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6857bff28723014fc359741f07d82b77ccad062bda89d8b023ae9517dfde3048
+size 8376795
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..732d3df044b54c4d576e765af6b66cf7d49ede50 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c6cae8b73c1ba64dd6f8a95d50808d0f96c3442588e0ee7c521e666397beb40
+size 12958324
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..aa5c07f6c59078d2fbbe29e9a98246ef8ad15065 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa0eff950ed43d1c253f2b1a3f259895bbffa3bee9d2b6502468fe7c3de050b8
+size 18431550
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8a1a1aae7e88cf0a75a6669233bc65f8063297f9 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7974ff13a24110559f516d7f81ffc6072c020bc498ceca28c4e7640bfc930e9e
+size 23921835
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..26925b4e534bbded415d06a83413693fe11b0c6a 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b312eade5aa25054bd7f12b3e19c5d7ea6499e17edc996b6caa4bdcfca0d531a
+size 29333993
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..30255018b0818df0e08f651520126acc6b1c7bce 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:26e70db861ffdbfc8cf88d452d03a5b56b13a4b70c3a538eb70524271e3f7804
+size 34778872
diff --git a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0aaee5fe029787cee5ee904b4410f5c9a2883942 100644
--- a/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl
+++ b/evaluation/generation/examples.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65d4d0227bd4d26f3e9a99d734f3d030328c20d626c6f922920b3dc155977f38
+size 13896009
diff --git a/evaluation/generation/merged.csv b/evaluation/generation/merged.csv
new file mode 100644
index 0000000000000000000000000000000000000000..5f415c367a2fa331512a8a072e9caca8883e8766
--- /dev/null
+++ b/evaluation/generation/merged.csv
@@ -0,0 +1,53 @@
+dataset,fewshots,prompt,metric,value
+e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.07031535968691954
+e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.07031535968691954
+e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.21317325143407617
+e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.21317325143407617
+e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.23375071908985015
+e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.23375071908985015
+e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.238646665315093
+e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.238646665315093
+e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23937430569965518
+e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23937430569965518
+e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.23628094628078694
+e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.23628094628078694
+e2e_nlg_cleaned,5,average,multiple,0.20525687458439684
+gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.02780714664643612
+gem_xsum,0,median,rouge2_fmeasure,0.02780714664643612
+gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04500583650450346
+gem_xsum,1,median,rouge2_fmeasure,0.04500583650450346
+gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.051882326943507785
+gem_xsum,2,median,rouge2_fmeasure,0.051882326943507785
+gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.05187808695380671
+gem_xsum,3,median,rouge2_fmeasure,0.05187808695380671
+gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01218356666574454
+gem_xsum,4,median,rouge2_fmeasure,0.01218356666574454
+gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0
+gem_xsum,5,median,rouge2_fmeasure,0.0
+gem_xsum,5,average,multiple,0.031459493952333106
+web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.03885973727074715
+web_nlg_en,0,median,rouge2_fmeasure,0.03885973727074715
+web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0635687916821055
+web_nlg_en,1,median,rouge2_fmeasure,0.0635687916821055
+web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.07545945954698206
+web_nlg_en,2,median,rouge2_fmeasure,0.07545945954698206
+web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.08448456167004659
+web_nlg_en,3,median,rouge2_fmeasure,0.08448456167004659
+web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.08639654396535042
+web_nlg_en,4,median,rouge2_fmeasure,0.08639654396535042
+web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.09234282351104058
+web_nlg_en,5,median,rouge2_fmeasure,0.09234282351104058
+web_nlg_en,5,average,multiple,0.07351865294104538
+wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03124497623150672
+wiki_lingua_en,0,median,rouge2_fmeasure,0.03124497623150672
+wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.03442090774378983
+wiki_lingua_en,1,median,rouge2_fmeasure,0.03442090774378983
+wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05444986552431719
+wiki_lingua_en,2,median,rouge2_fmeasure,0.05444986552431719
+wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04583954485462417
+wiki_lingua_en,3,median,rouge2_fmeasure,0.04583954485462417
+wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.01577378906241384
+wiki_lingua_en,4,median,rouge2_fmeasure,0.01577378906241384
+wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0031062476201456015
+wiki_lingua_en,5,median,rouge2_fmeasure,0.0031062476201456015
+wiki_lingua_en,5,average,multiple,0.030805888506132893
diff --git a/evaluation/generation/merged.json b/evaluation/generation/merged.json
new file mode 100644
index 0000000000000000000000000000000000000000..19363da1a22475f96542a0f68f986c6959491b00
--- /dev/null
+++ b/evaluation/generation/merged.json
@@ -0,0 +1 @@
+{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2562657244023514, "bleu_stderr": 0.028344931968085282, "rouge1_fmeasure": 0.08917647320322222, "rouge1_fmeasure_stderr": 0.001969587935532424, "rouge1_precision": 0.062384701456087446, "rouge1_precision_stderr": 0.0020959572066061704, "rouge1_recall": 0.2778238258177499, "rouge1_recall_stderr": 0.0053586090146941585, "rouge2_fmeasure": 0.03885973727074715, "rouge2_fmeasure_stderr": 0.0011411233576479184, "rouge2_precision": 0.026957871325524504, "rouge2_precision_stderr": 0.00119791534895436, "rouge2_recall": 0.12120656565208492, "rouge2_recall_stderr": 0.003245668709085193, "rougeL_fmeasure": 0.08588431201368854, "rougeL_fmeasure_stderr": 0.0018539010477647254, "rougeL_precision": 0.059948665447718395, "rougeL_precision_stderr": 0.001993918360545074, "rougeL_recall": 0.2696777384075072, "rougeL_recall_stderr": 0.005223094399514082, "rougeLsum_fmeasure": 0.08392617035294098, "rougeLsum_fmeasure_stderr": 0.001881634203344931, "rougeLsum_precision": 0.05914169795430463, "rougeLsum_precision_stderr": 0.0020593373696654797, "rougeLsum_recall": 0.25838316242026105, "rougeLsum_recall_stderr": 0.00491849430291812}}, "1": {"PALM_prompt": {"bleu": 0.44906636509622533, "bleu_stderr": 0.039450588630595015, "rouge1_fmeasure": 0.13316786961188862, "rouge1_fmeasure_stderr": 0.0032035615287408364, "rouge1_precision": 0.11288061517490074, "rouge1_precision_stderr": 0.00386994874457578, "rouge1_recall": 0.27968139569340955, "rouge1_recall_stderr": 0.004818798730073509, "rouge2_fmeasure": 0.0635687916821055, "rouge2_fmeasure_stderr": 0.0020746952988998694, "rouge2_precision": 0.052903771126418274, "rouge2_precision_stderr": 0.0024560083140397187, "rouge2_recall": 0.13670929059101208, "rouge2_recall_stderr": 0.0032752111527625427, "rougeL_fmeasure": 0.12246686808714284, "rougeL_fmeasure_stderr": 0.0027865982212275067, "rougeL_precision": 0.10294903334837356, "rougeL_precision_stderr": 0.0034785078085129637, "rougeL_recall": 0.26443193955147987, "rougeL_recall_stderr": 0.004508721603501755, "rougeLsum_fmeasure": 0.12355055054182587, "rougeLsum_fmeasure_stderr": 0.002841014595626855, "rougeLsum_precision": 0.1043770278231783, "rougeLsum_precision_stderr": 0.0035484629358203635, "rougeLsum_recall": 0.2648398336113232, "rougeLsum_recall_stderr": 0.004503000799632382}}, "2": {"PALM_prompt": {"bleu": 0.4687852352128821, "bleu_stderr": 0.037809750600987985, "rouge1_fmeasure": 0.15485025651871712, "rouge1_fmeasure_stderr": 0.0034302178802564512, "rouge1_precision": 0.12939788800380767, "rouge1_precision_stderr": 0.004056238000462917, "rouge1_recall": 0.3205721940394599, "rouge1_recall_stderr": 0.004784451352989951, "rouge2_fmeasure": 0.07545945954698206, "rouge2_fmeasure_stderr": 0.0021937365311268525, "rouge2_precision": 0.06272684866048131, "rouge2_precision_stderr": 0.002451752863709562, "rouge2_recall": 0.16040449198937512, "rouge2_recall_stderr": 0.003451151403396264, "rougeL_fmeasure": 0.14062442945039813, "rougeL_fmeasure_stderr": 0.0028845693981807787, "rougeL_precision": 0.11538953087200472, "rougeL_precision_stderr": 0.00346589668028597, "rougeL_recall": 0.30175228398138576, "rougeL_recall_stderr": 0.004432821529779081, "rougeLsum_fmeasure": 0.14250354572951518, "rougeLsum_fmeasure_stderr": 0.002959772587239683, "rougeLsum_precision": 0.11767324801231895, "rougeLsum_precision_stderr": 0.0035688465184641432, "rougeLsum_recall": 0.3033452610052125, "rougeLsum_recall_stderr": 0.004447592173686305}}, "3": {"PALM_prompt": {"bleu": 0.5387522714694768, "bleu_stderr": 0.023700066593108533, "rouge1_fmeasure": 0.1660401631118107, "rouge1_fmeasure_stderr": 0.0037496181550606754, "rouge1_precision": 0.14028377513050538, "rouge1_precision_stderr": 0.004341036446200867, "rouge1_recall": 0.33808480225227105, "rouge1_recall_stderr": 0.004840897586860907, "rouge2_fmeasure": 0.08448456167004659, "rouge2_fmeasure_stderr": 0.002619780755266116, "rouge2_precision": 0.07249658632299814, "rouge2_precision_stderr": 0.0029199126351436076, "rouge2_recall": 0.1738659001719329, "rouge2_recall_stderr": 0.003647562909324833, "rougeL_fmeasure": 0.15093871662748365, "rougeL_fmeasure_stderr": 0.0032225789949202046, "rougeL_precision": 0.12529450659786737, "rougeL_precision_stderr": 0.003740919506421422, "rougeL_recall": 0.3182618579312013, "rougeL_recall_stderr": 0.004525730253384199, "rougeLsum_fmeasure": 0.15379137375695537, "rougeLsum_fmeasure_stderr": 0.0033296076273151513, "rougeLsum_precision": 0.12866365606890057, "rougeLsum_precision_stderr": 0.003897701131855631, "rougeLsum_recall": 0.3209266755398485, "rougeLsum_recall_stderr": 0.004550196873941636}}, "4": {"PALM_prompt": {"bleu": 0.6154728892401495, "bleu_stderr": 0.05492052153457257, "rouge1_fmeasure": 0.16918194985742865, "rouge1_fmeasure_stderr": 0.003726050287797417, "rouge1_precision": 0.1416216780254008, "rouge1_precision_stderr": 0.004403560474088744, "rouge1_recall": 0.350369181874358, "rouge1_recall_stderr": 0.004821260815960307, "rouge2_fmeasure": 0.08639654396535042, "rouge2_fmeasure_stderr": 0.0025466510150414677, "rouge2_precision": 0.07377427634131609, "rouge2_precision_stderr": 0.0029769913238928973, "rouge2_recall": 0.18193245830782417, "rouge2_recall_stderr": 0.003657633632618109, "rougeL_fmeasure": 0.15419230913013102, "rougeL_fmeasure_stderr": 0.0031966205747524113, "rougeL_precision": 0.12691040102952247, "rougeL_precision_stderr": 0.0038030608678551996, "rougeL_recall": 0.33016396642156515, "rougeL_recall_stderr": 0.004483672398798779, "rougeLsum_fmeasure": 0.15686161901707832, "rougeLsum_fmeasure_stderr": 0.003303387308503824, "rougeLsum_precision": 0.1301329876623986, "rougeLsum_precision_stderr": 0.003962229410505262, "rougeLsum_recall": 0.3326898264482343, "rougeLsum_recall_stderr": 0.004514627367420683}}, "5": {"PALM_prompt": {"bleu": 0.6280080728696126, "bleu_stderr": 0.04207890973166607, "rouge1_fmeasure": 0.1785781279253249, "rouge1_fmeasure_stderr": 0.003864561091208021, "rouge1_precision": 0.15509132478375567, "rouge1_precision_stderr": 0.004705843597864089, "rouge1_recall": 0.36072408074578627, "rouge1_recall_stderr": 0.004932090632111422, "rouge2_fmeasure": 0.09234282351104058, "rouge2_fmeasure_stderr": 0.0026995310439716595, "rouge2_precision": 0.08180998073756966, "rouge2_precision_stderr": 0.0031797028106942695, "rouge2_recall": 0.1895120629191651, "rouge2_recall_stderr": 0.0037580265385154963, "rougeL_fmeasure": 0.1620344936907299, "rougeL_fmeasure_stderr": 0.003324512248892316, "rougeL_precision": 0.13813720543380648, "rougeL_precision_stderr": 0.004053616600136467, "rougeL_recall": 0.33922496409575775, "rougeL_recall_stderr": 0.004622505715509403, "rougeLsum_fmeasure": 0.16446961325308998, "rougeLsum_fmeasure_stderr": 0.0034245269422962608, "rougeLsum_precision": 0.14168764741861828, "rougeLsum_precision_stderr": 0.004239364453934911, "rougeLsum_recall": 0.34078189394441194, "rougeLsum_recall_stderr": 0.004625313010789047}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4416513225274035, "bleu_stderr": 0.07150581722616807, "rouge1_fmeasure": 0.16843326604027012, "rouge1_fmeasure_stderr": 0.0018787988149176356, "rouge1_precision": 0.14831832571904127, "rouge1_precision_stderr": 0.0020031574591708096, "rouge1_recall": 0.23669379639579727, "rouge1_recall_stderr": 0.002619263402445374, "rouge2_fmeasure": 0.03124497623150672, "rouge2_fmeasure_stderr": 0.000826359540629872, "rouge2_precision": 0.027495279137204204, "rouge2_precision_stderr": 0.0008008675778483577, "rouge2_recall": 0.04558631579219217, "rouge2_recall_stderr": 0.0013034624565288761, "rougeL_fmeasure": 0.1373698337101501, "rougeL_fmeasure_stderr": 0.0014061340796650912, "rougeL_precision": 0.11944058601706234, "rougeL_precision_stderr": 0.0014613554843883904, "rougeL_recall": 0.197662087047554, "rougeL_recall_stderr": 0.0022098401272736727, "rougeLsum_fmeasure": 0.15530559329563356, "rougeLsum_fmeasure_stderr": 0.0017277092690194619, "rougeLsum_precision": 0.13662513471708013, "rougeLsum_precision_stderr": 0.001834224913870075, "rougeLsum_recall": 0.21848967035350617, "rougeLsum_recall_stderr": 0.00242500324850602}}, "1": {"tldr_en": {"bleu": 1.9112541973999004, "bleu_stderr": 0.08163120654641273, "rouge1_fmeasure": 0.1595930665634592, "rouge1_fmeasure_stderr": 0.0020260063889951897, "rouge1_precision": 0.2474989333475308, "rouge1_precision_stderr": 0.004192775409890903, "rouge1_recall": 0.16589702996724112, "rouge1_recall_stderr": 0.002477684211999076, "rouge2_fmeasure": 0.03442090774378983, "rouge2_fmeasure_stderr": 0.0011197085684525982, "rouge2_precision": 0.06456439217534583, "rouge2_precision_stderr": 0.0026036519415425876, "rouge2_recall": 0.03395563608028862, "rouge2_recall_stderr": 0.0011867665512589468, "rougeL_fmeasure": 0.12543184498335516, "rougeL_fmeasure_stderr": 0.0015786460435038327, "rougeL_precision": 0.2007100384592662, "rougeL_precision_stderr": 0.0036402244436710527, "rougeL_recall": 0.12968060330920403, "rougeL_recall_stderr": 0.0019100623389590725, "rougeLsum_fmeasure": 0.15113313197846198, "rougeLsum_fmeasure_stderr": 0.0019006022698345565, "rougeLsum_precision": 0.2356687196157363, "rougeLsum_precision_stderr": 0.004040097148447038, "rougeLsum_recall": 0.15692818247747958, "rougeLsum_recall_stderr": 0.0023149233432171795}}, "2": {"tldr_en": {"bleu": 2.4920293719291493, "bleu_stderr": 0.06820740373159184, "rouge1_fmeasure": 0.20013093911819188, "rouge1_fmeasure_stderr": 0.0021449192871676984, "rouge1_precision": 0.3671552578777415, "rouge1_precision_stderr": 0.00464310641179873, "rouge1_recall": 0.18316424412100443, "rouge1_recall_stderr": 0.002534818661881543, "rouge2_fmeasure": 0.05444986552431719, "rouge2_fmeasure_stderr": 0.0012835369876590637, "rouge2_precision": 0.11652986149569292, "rouge2_precision_stderr": 0.0033165278944766326, "rouge2_recall": 0.048557963351484004, "rouge2_recall_stderr": 0.0012897480876968325, "rougeL_fmeasure": 0.1600577177279773, "rougeL_fmeasure_stderr": 0.0017194821563251488, "rougeL_precision": 0.3030100733810117, "rougeL_precision_stderr": 0.004160446544180668, "rougeL_recall": 0.1454519391726235, "rougeL_recall_stderr": 0.0020102539956042787, "rougeLsum_fmeasure": 0.18899973850835908, "rougeLsum_fmeasure_stderr": 0.0020281161914398288, "rougeLsum_precision": 0.34948494261958846, "rougeLsum_precision_stderr": 0.00452044298551405, "rougeLsum_recall": 0.17241597783925564, "rougeLsum_recall_stderr": 0.0023681037127358745}}, "3": {"tldr_en": {"bleu": 1.6055821821151304, "bleu_stderr": 0.06565825928935355, "rouge1_fmeasure": 0.16433308908585867, "rouge1_fmeasure_stderr": 0.002402134739956357, "rouge1_precision": 0.31853023306817924, "rouge1_precision_stderr": 0.005101746577027814, "rouge1_recall": 0.14722792301810253, "rouge1_recall_stderr": 0.0026467115286204013, "rouge2_fmeasure": 0.04583954485462417, "rouge2_fmeasure_stderr": 0.0012897030660356447, "rouge2_precision": 0.10260882729326853, "rouge2_precision_stderr": 0.0033488070271773295, "rouge2_recall": 0.04037648707007591, "rouge2_recall_stderr": 0.0012615029724490608, "rougeL_fmeasure": 0.13364146877613045, "rougeL_fmeasure_stderr": 0.00195553742839865, "rougeL_precision": 0.26690611784240603, "rougeL_precision_stderr": 0.004530048771512807, "rougeL_recall": 0.11874183155575303, "rougeL_recall_stderr": 0.0021251987841602225, "rougeLsum_fmeasure": 0.15587618164528091, "rougeLsum_fmeasure_stderr": 0.002273493914941302, "rougeLsum_precision": 0.3046492013868888, "rougeLsum_precision_stderr": 0.004949401564936648, "rougeLsum_recall": 0.1393491071946238, "rougeLsum_recall_stderr": 0.0024976587731928646}}, "4": {"tldr_en": {"bleu": 0.006932560692974423, "bleu_stderr": 0.0017990239865701577, "rouge1_fmeasure": 0.05455257649733116, "rouge1_fmeasure_stderr": 0.002012404693346845, "rouge1_precision": 0.10914047578759525, "rouge1_precision_stderr": 0.004168163234199949, "rouge1_recall": 0.04875414041264775, "rouge1_recall_stderr": 0.0020018963702338544, "rouge2_fmeasure": 0.01577378906241384, "rouge2_fmeasure_stderr": 0.0009385589100462621, "rouge2_precision": 0.037261877113724175, "rouge2_precision_stderr": 0.002437336913811512, "rouge2_recall": 0.013448412424643126, "rouge2_recall_stderr": 0.0008658300088105147, "rougeL_fmeasure": 0.045100032510421204, "rougeL_fmeasure_stderr": 0.0016666399875103265, "rougeL_precision": 0.09325777757588789, "rougeL_precision_stderr": 0.003680574837942309, "rougeL_recall": 0.03991105511182912, "rougeL_recall_stderr": 0.0016279952050045603, "rougeLsum_fmeasure": 0.051422475813845084, "rougeLsum_fmeasure_stderr": 0.0018931617381586926, "rougeLsum_precision": 0.10390247551362326, "rougeLsum_precision_stderr": 0.004006559051889402, "rougeLsum_recall": 0.04597031630948619, "rougeLsum_recall_stderr": 0.0018861168846922784}}, "5": {"tldr_en": {"bleu": 1.5486400435190646e-20, "bleu_stderr": 5.928974165985541e-18, "rouge1_fmeasure": 0.009250948371284507, "rouge1_fmeasure_stderr": 0.0009365636508956353, "rouge1_precision": 0.019024182319545703, "rouge1_precision_stderr": 0.0019567530452185085, "rouge1_recall": 0.00809080049652358, "rouge1_recall_stderr": 0.0008930277662354756, "rouge2_fmeasure": 0.0031062476201456015, "rouge2_fmeasure_stderr": 0.00043823448085888057, "rouge2_precision": 0.007878162042298584, "rouge2_precision_stderr": 0.0012119853940782482, "rouge2_recall": 0.0024981499124714754, "rouge2_recall_stderr": 0.00036249889299460995, "rougeL_fmeasure": 0.007784924334651645, "rougeL_fmeasure_stderr": 0.0007922903839285351, "rougeL_precision": 0.016781942931569222, "rougeL_precision_stderr": 0.0017964368358031513, "rougeL_recall": 0.006703878314586593, "rougeL_recall_stderr": 0.0007330971808896851, "rougeLsum_fmeasure": 0.008943947340808445, "rougeLsum_fmeasure_stderr": 0.0009064353361518755, "rougeLsum_precision": 0.01855296243342967, "rougeLsum_precision_stderr": 0.001923182050974051, "rougeLsum_recall": 0.00783323994370169, "rougeLsum_recall_stderr": 0.0008689584836341853}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.9626050306721474, "bleu_stderr": 0.06024253979194259, "rouge1_fmeasure": 0.20284230759257002, "rouge1_fmeasure_stderr": 0.0017464348006575089, "rouge1_precision": 0.1540985111881967, "rouge1_precision_stderr": 0.0014515620097711474, "rouge1_recall": 0.31140262883046665, "rouge1_recall_stderr": 0.0024251988602698067, "rouge2_fmeasure": 0.07031535968691954, "rouge2_fmeasure_stderr": 0.001168196298024764, "rouge2_precision": 0.05355018382472272, "rouge2_precision_stderr": 0.0009106172876128309, "rouge2_recall": 0.10690154622755875, "rouge2_recall_stderr": 0.0017688696499961142, "rougeL_fmeasure": 0.1753585977687643, "rougeL_fmeasure_stderr": 0.0014608296346256319, "rougeL_precision": 0.1327617315507999, "rougeL_precision_stderr": 0.001197611976138011, "rougeL_recall": 0.2711114650902915, "rougeL_recall_stderr": 0.0021100724050297824, "rougeLsum_fmeasure": 0.18282034804175978, "rougeLsum_fmeasure_stderr": 0.0016241979991028408, "rougeLsum_precision": 0.1387916405736721, "rougeLsum_precision_stderr": 0.0013406445609266567, "rougeLsum_recall": 0.2810609872616038, "rougeLsum_recall_stderr": 0.0022804790016154107}}, "1": {"generate_text_restaurant": {"bleu": 11.593827123577368, "bleu_stderr": 0.12407021076125696, "rouge1_fmeasure": 0.45020216196031243, "rouge1_fmeasure_stderr": 0.0023747076071133698, "rouge1_precision": 0.5509122991009537, "rouge1_precision_stderr": 0.0033661250504397264, "rouge1_recall": 0.42140078880413323, "rouge1_recall_stderr": 0.002953747532093766, "rouge2_fmeasure": 0.21317325143407617, "rouge2_fmeasure_stderr": 0.0020127889592610262, "rouge2_precision": 0.2650742428626904, "rouge2_precision_stderr": 0.0027402930606959166, "rouge2_recall": 0.19921132653073137, "rouge2_recall_stderr": 0.0021274437677356846, "rougeL_fmeasure": 0.32668549807617586, "rougeL_fmeasure_stderr": 0.002086318029186934, "rougeL_precision": 0.40311524575775876, "rougeL_precision_stderr": 0.003054424817440509, "rougeL_recall": 0.30479322858663915, "rougeL_recall_stderr": 0.0023980133198678024, "rougeLsum_fmeasure": 0.3686111369512851, "rougeLsum_fmeasure_stderr": 0.0023418416254223887, "rougeLsum_precision": 0.4523945212247963, "rougeLsum_precision_stderr": 0.00326815326036064, "rougeLsum_recall": 0.3446091324717908, "rougeLsum_recall_stderr": 0.0027191831013193444}}, "2": {"generate_text_restaurant": {"bleu": 12.88190096920724, "bleu_stderr": 0.17583922253090617, "rouge1_fmeasure": 0.4721264666575481, "rouge1_fmeasure_stderr": 0.002268547262390229, "rouge1_precision": 0.5822720419906148, "rouge1_precision_stderr": 0.0032962978807536585, "rouge1_recall": 0.4337280880183242, "rouge1_recall_stderr": 0.0028431977686949614, "rouge2_fmeasure": 0.23375071908985015, "rouge2_fmeasure_stderr": 0.0020523859536162883, "rouge2_precision": 0.2931157748619445, "rouge2_precision_stderr": 0.0028165108542885105, "rouge2_recall": 0.2147077046167617, "rouge2_recall_stderr": 0.0021786795843148085, "rougeL_fmeasure": 0.3522716641150115, "rougeL_fmeasure_stderr": 0.0020991364053741914, "rougeL_precision": 0.4369685674136242, "rougeL_precision_stderr": 0.003073517456588823, "rougeL_recall": 0.32297689774894583, "rougeL_recall_stderr": 0.0024097063140581243, "rougeLsum_fmeasure": 0.393682260751447, "rougeLsum_fmeasure_stderr": 0.0023007053568388975, "rougeLsum_precision": 0.4863687819740207, "rougeLsum_precision_stderr": 0.0032507530347318466, "rougeLsum_recall": 0.3613566974986443, "rougeLsum_recall_stderr": 0.0026568660208198115}}, "3": {"generate_text_restaurant": {"bleu": 13.262436270778784, "bleu_stderr": 0.12966668718803287, "rouge1_fmeasure": 0.47525930406845607, "rouge1_fmeasure_stderr": 0.0022663496963600087, "rouge1_precision": 0.5813922477147344, "rouge1_precision_stderr": 0.0032594641701596022, "rouge1_recall": 0.43750260743630337, "rouge1_recall_stderr": 0.002850951212768957, "rouge2_fmeasure": 0.238646665315093, "rouge2_fmeasure_stderr": 0.0020938647359093243, "rouge2_precision": 0.2963848070406136, "rouge2_precision_stderr": 0.002811561987291426, "rouge2_recall": 0.2198532494536229, "rouge2_recall_stderr": 0.0022399911777690423, "rougeL_fmeasure": 0.3550920935079866, "rougeL_fmeasure_stderr": 0.002168420115905195, "rougeL_precision": 0.43608943087396823, "rougeL_precision_stderr": 0.0030728003407875415, "rougeL_recall": 0.32653357863256616, "rougeL_recall_stderr": 0.002478214580278462, "rougeLsum_fmeasure": 0.3982122870175536, "rougeLsum_fmeasure_stderr": 0.002333698912202139, "rougeLsum_precision": 0.4873959951365985, "rougeLsum_precision_stderr": 0.0032309608357424142, "rougeLsum_recall": 0.3665706405925854, "rougeLsum_recall_stderr": 0.0027071065833861527}}, "4": {"generate_text_restaurant": {"bleu": 13.476662518674425, "bleu_stderr": 0.14804924963101374, "rouge1_fmeasure": 0.4751483029381261, "rouge1_fmeasure_stderr": 0.0022939871226639315, "rouge1_precision": 0.575148309756594, "rouge1_precision_stderr": 0.0032456743499240143, "rouge1_recall": 0.4379553251280355, "rouge1_recall_stderr": 0.0027875515895071923, "rouge2_fmeasure": 0.23937430569965518, "rouge2_fmeasure_stderr": 0.0021223157395609733, "rouge2_precision": 0.2932213385624628, "rouge2_precision_stderr": 0.0027850927611603377, "rouge2_recall": 0.220787156975807, "rouge2_recall_stderr": 0.0022308643708334754, "rougeL_fmeasure": 0.35385483221937813, "rougeL_fmeasure_stderr": 0.0021780339813240314, "rougeL_precision": 0.4297630033921168, "rougeL_precision_stderr": 0.0030250335936550877, "rougeL_recall": 0.3260216890650744, "rougeL_recall_stderr": 0.0024522487516979575, "rougeLsum_fmeasure": 0.3976384682503085, "rougeLsum_fmeasure_stderr": 0.0023729927223769785, "rougeLsum_precision": 0.4813876805218902, "rougeLsum_precision_stderr": 0.0032181540128894265, "rougeLsum_recall": 0.36675888786779215, "rougeLsum_recall_stderr": 0.002699461696539119}}, "5": {"generate_text_restaurant": {"bleu": 13.156897203455948, "bleu_stderr": 0.08630136341873869, "rouge1_fmeasure": 0.4729751649957838, "rouge1_fmeasure_stderr": 0.0022801925622949843, "rouge1_precision": 0.5738627336760422, "rouge1_precision_stderr": 0.0032894505336520264, "rouge1_recall": 0.4353618479598648, "rouge1_recall_stderr": 0.0027767731270488508, "rouge2_fmeasure": 0.23628094628078694, "rouge2_fmeasure_stderr": 0.00211214055033469, "rouge2_precision": 0.29086794940233046, "rouge2_precision_stderr": 0.002818639353471394, "rouge2_recall": 0.21741764823749096, "rouge2_recall_stderr": 0.002221377915423391, "rougeL_fmeasure": 0.3534318578391924, "rougeL_fmeasure_stderr": 0.0021797527155027353, "rougeL_precision": 0.4304412705229436, "rougeL_precision_stderr": 0.003073791049621951, "rougeL_recall": 0.32508975084283026, "rougeL_recall_stderr": 0.0024528402950547467, "rougeLsum_fmeasure": 0.39642244501779056, "rougeLsum_fmeasure_stderr": 0.002354679808047655, "rougeLsum_precision": 0.4815893698288744, "rougeLsum_precision_stderr": 0.0032662227256431515, "rougeLsum_recall": 0.36475434401700674, "rougeLsum_recall_stderr": 0.00266853275926591}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.0285249742430023, "bleu_stderr": 0.04658561770880105, "rouge1_fmeasure": 0.1478842341216362, "rouge1_fmeasure_stderr": 0.0027472203051295764, "rouge1_precision": 0.10711262762227787, "rouge1_precision_stderr": 0.0020601915536585967, "rouge1_recall": 0.25337181483295146, "rouge1_recall_stderr": 0.0046856436696164905, "rouge2_fmeasure": 0.02780714664643612, "rouge2_fmeasure_stderr": 0.0012134748561574466, "rouge2_precision": 0.019781013833534398, "rouge2_precision_stderr": 0.0008722452654034713, "rouge2_recall": 0.04912277748197487, "rouge2_recall_stderr": 0.0021576559260432712, "rougeL_fmeasure": 0.11266163818049864, "rougeL_fmeasure_stderr": 0.0020299599299667603, "rougeL_precision": 0.08153762034631415, "rougeL_precision_stderr": 0.0015205040880351287, "rougeL_recall": 0.19377740656973594, "rougeL_recall_stderr": 0.0035212605780510554, "rougeLsum_fmeasure": 0.11968068511417429, "rougeLsum_fmeasure_stderr": 0.0021748845339722685, "rougeLsum_precision": 0.08657778420332544, "rougeLsum_precision_stderr": 0.0016263154802718866, "rougeLsum_recall": 0.20583552514369016, "rougeLsum_recall_stderr": 0.003771689144281502}}, "1": {"article_DOC_summary": {"bleu": 2.1792875082477123, "bleu_stderr": 0.06319420014636688, "rouge1_fmeasure": 0.2183934186796248, "rouge1_fmeasure_stderr": 0.0031131023395006203, "rouge1_precision": 0.2154872949099143, "rouge1_precision_stderr": 0.0037469904189591395, "rouge1_recall": 0.25867586586469277, "rouge1_recall_stderr": 0.004134512250176692, "rouge2_fmeasure": 0.04500583650450346, "rouge2_fmeasure_stderr": 0.001863892149786922, "rouge2_precision": 0.0447245429308555, "rouge2_precision_stderr": 0.0020714108653434506, "rouge2_recall": 0.05502827731187604, "rouge2_recall_stderr": 0.0023758194054523916, "rougeL_fmeasure": 0.1635144970491249, "rougeL_fmeasure_stderr": 0.0024820064917382306, "rougeL_precision": 0.1614628963956234, "rougeL_precision_stderr": 0.0030120325913654804, "rougeL_recall": 0.19472956411902964, "rougeL_recall_stderr": 0.0033235017592203077, "rougeLsum_fmeasure": 0.1671666603817299, "rougeLsum_fmeasure_stderr": 0.0025413985087851003, "rougeLsum_precision": 0.16439844895006708, "rougeLsum_precision_stderr": 0.003021699034691555, "rougeLsum_recall": 0.2002131667137321, "rougeLsum_recall_stderr": 0.003522860713518912}}, "2": {"article_DOC_summary": {"bleu": 2.9719119603418074, "bleu_stderr": 0.16832095088560176, "rouge1_fmeasure": 0.23579050025596546, "rouge1_fmeasure_stderr": 0.0032208877086972645, "rouge1_precision": 0.24578221078853943, "rouge1_precision_stderr": 0.0038311545433087246, "rouge1_recall": 0.25247232169607386, "rouge1_recall_stderr": 0.003851791759690215, "rouge2_fmeasure": 0.051882326943507785, "rouge2_fmeasure_stderr": 0.002120030804437139, "rouge2_precision": 0.05385976155670831, "rouge2_precision_stderr": 0.002293715583911734, "rouge2_recall": 0.05655894066313464, "rouge2_recall_stderr": 0.002406489537857907, "rougeL_fmeasure": 0.17839101172174274, "rougeL_fmeasure_stderr": 0.0026918945920387577, "rougeL_precision": 0.18601506569279253, "rougeL_precision_stderr": 0.0031620641585856746, "rougeL_recall": 0.19159840411716447, "rougeL_recall_stderr": 0.003222530909275831, "rougeLsum_fmeasure": 0.18050054360244336, "rougeLsum_fmeasure_stderr": 0.002709070917463213, "rougeLsum_precision": 0.18783807621807994, "rougeLsum_precision_stderr": 0.003156835477899629, "rougeLsum_recall": 0.19452321319767416, "rougeLsum_recall_stderr": 0.0033253733370942227}}, "3": {"article_DOC_summary": {"bleu": 3.2041870719788714, "bleu_stderr": 0.20611661341919532, "rouge1_fmeasure": 0.2277409702430376, "rouge1_fmeasure_stderr": 0.0036665816219810165, "rouge1_precision": 0.24477878269919434, "rouge1_precision_stderr": 0.0043304256076557015, "rouge1_recall": 0.23546035253406764, "rouge1_recall_stderr": 0.0040420090423460295, "rouge2_fmeasure": 0.05187808695380671, "rouge2_fmeasure_stderr": 0.0022736997047107283, "rouge2_precision": 0.05535540959023755, "rouge2_precision_stderr": 0.002520852290920342, "rouge2_recall": 0.05400488265902618, "rouge2_recall_stderr": 0.0023942103295597683, "rougeL_fmeasure": 0.1726765296689622, "rougeL_fmeasure_stderr": 0.0030037675906149446, "rougeL_precision": 0.18676731153569484, "rougeL_precision_stderr": 0.0035994106495257387, "rougeL_recall": 0.17809432971388495, "rougeL_recall_stderr": 0.003241778335625792, "rougeLsum_fmeasure": 0.1742948037879305, "rougeLsum_fmeasure_stderr": 0.0030227160469161974, "rougeLsum_precision": 0.1881533844870454, "rougeLsum_precision_stderr": 0.003601435982381053, "rougeLsum_recall": 0.18040945162861388, "rougeLsum_recall_stderr": 0.0033378628217040685}}, "4": {"article_DOC_summary": {"bleu": 0.114133025618453, "bleu_stderr": 0.03902166340342278, "rouge1_fmeasure": 0.05638135159346031, "rouge1_fmeasure_stderr": 0.0033862796262482922, "rouge1_precision": 0.06536803730199292, "rouge1_precision_stderr": 0.0040080446277792475, "rouge1_recall": 0.0562578647719299, "rouge1_recall_stderr": 0.003466710519052521, "rouge2_fmeasure": 0.01218356666574454, "rouge2_fmeasure_stderr": 0.0012325430366165824, "rouge2_precision": 0.013453387419618108, "rouge2_precision_stderr": 0.0014082914388032289, "rouge2_recall": 0.012484556808642255, "rouge2_recall_stderr": 0.0012859869239756115, "rougeL_fmeasure": 0.042197213021247744, "rougeL_fmeasure_stderr": 0.0025684294584781297, "rougeL_precision": 0.049695294130091966, "rougeL_precision_stderr": 0.0031067522139507655, "rougeL_recall": 0.04191724945748147, "rougeL_recall_stderr": 0.0026229390757436582, "rougeLsum_fmeasure": 0.042656324273577836, "rougeLsum_fmeasure_stderr": 0.002590374442922286, "rougeLsum_precision": 0.0501407378152416, "rougeLsum_precision_stderr": 0.00312422462775062, "rougeLsum_recall": 0.04244935232602974, "rougeLsum_recall_stderr": 0.0026544789695008624}}, "5": {"article_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0, "rouge1_fmeasure_stderr": 0.0, "rouge1_precision": 0.0, "rouge1_precision_stderr": 0.0, "rouge1_recall": 0.0, "rouge1_recall_stderr": 0.0, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0, "rougeL_fmeasure_stderr": 0.0, "rougeL_precision": 0.0, "rougeL_precision_stderr": 0.0, "rougeL_recall": 0.0, "rougeL_recall_stderr": 0.0, "rougeLsum_fmeasure": 0.0, "rougeLsum_fmeasure_stderr": 0.0, "rougeLsum_precision": 0.0, "rougeLsum_precision_stderr": 0.0, "rougeLsum_recall": 0.0, "rougeLsum_recall_stderr": 0.0}}}}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..f66d640fcd36cf72d21df2ec8c20e601391367f8
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_0.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.2562657244023514,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.028344931968085282
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.062384701456087446,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0020959572066061704
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.2778238258177499,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0053586090146941585
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.08917647320322222,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.001969587935532424
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.026957871325524504,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.00119791534895436
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.12120656565208492,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.003245668709085193
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.03885973727074715,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0011411233576479184
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.059948665447718395,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.001993918360545074
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.2696777384075072,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.005223094399514082
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.08588431201368854,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0018539010477647254
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.05914169795430463,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0020593373696654797
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.25838316242026105,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.00491849430291812
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.08392617035294098,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.001881634203344931
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 0,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..f84750876ce8420c0a309fcc42d25404ed91b2a9
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_1.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.44906636509622533,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.039450588630595015
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.11288061517490074,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.00386994874457578
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.27968139569340955,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004818798730073509
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.13316786961188862,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0032035615287408364
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.052903771126418274,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0024560083140397187
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.13670929059101208,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0032752111527625427
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.0635687916821055,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0020746952988998694
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.10294903334837356,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0034785078085129637
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.26443193955147987,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004508721603501755
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.12246686808714284,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0027865982212275067
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.1043770278231783,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0035484629358203635
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.2648398336113232,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004503000799632382
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.12355055054182587,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002841014595626855
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 1,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..5ae49c94ac1f7e59629792dc7b6c5544f286f5cf
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_2.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.4687852352128821,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.037809750600987985
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.12939788800380767,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004056238000462917
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.3205721940394599,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004784451352989951
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.15485025651871712,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0034302178802564512
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.06272684866048131,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.002451752863709562
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.16040449198937512,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.003451151403396264
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.07545945954698206,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0021937365311268525
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.11538953087200472,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.00346589668028597
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.30175228398138576,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004432821529779081
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.14062442945039813,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0028845693981807787
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.11767324801231895,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.0035688465184641432
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.3033452610052125,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004447592173686305
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.14250354572951518,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002959772587239683
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 2,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..ee67a8331ae0579f811dcb9442bfee314595c6ca
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_3.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.5387522714694768,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.023700066593108533
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.14028377513050538,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004341036446200867
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.33808480225227105,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004840897586860907
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.1660401631118107,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0037496181550606754
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.07249658632299814,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0029199126351436076
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.1738659001719329,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.003647562909324833
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.08448456167004659,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.002619780755266116
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.12529450659786737,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.003740919506421422
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.3182618579312013,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004525730253384199
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.15093871662748365,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0032225789949202046
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.12866365606890057,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.003897701131855631
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.3209266755398485,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004550196873941636
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.15379137375695537,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0033296076273151513
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..0dd2508bb2ef152493fbfe017d4df44bb33ddc27
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.6154728892401495,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.05492052153457257
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.1416216780254008,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004403560474088744
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.350369181874358,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004821260815960307
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.16918194985742865,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.003726050287797417
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.07377427634131609,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0029769913238928973
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.18193245830782417,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.003657633632618109
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.08639654396535042,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0025466510150414677
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.12691040102952247,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0038030608678551996
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.33016396642156515,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004483672398798779
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.15419230913013102,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0031966205747524113
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.1301329876623986,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.003962229410505262
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.3326898264482343,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004514627367420683
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.15686161901707832,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.003303387308503824
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..c51e9124726e2fc64dafbbe40587856eabc17612
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-web_nlg_en_PALM_prompt_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "bleu": 0.6280080728696126,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.04207890973166607
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_precision": 0.15509132478375567,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004705843597864089
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_recall": 0.36072408074578627,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.004932090632111422
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge1_fmeasure": 0.1785781279253249,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.003864561091208021
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_precision": 0.08180998073756966,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0031797028106942695
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_recall": 0.1895120629191651,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0037580265385154963
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rouge2_fmeasure": 0.09234282351104058,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0026995310439716595
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_precision": 0.13813720543380648,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.004053616600136467
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_recall": 0.33922496409575775,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.004622505715509403
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeL_fmeasure": 0.1620344936907299,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.003324512248892316
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_precision": 0.14168764741861828,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.004239364453934911
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_recall": 0.34078189394441194,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.004625313010789047
+    },
+    {
+      "task_name": "GEM/web_nlg_en",
+      "prompt_name": "PALM_prompt",
+      "rougeLsum_fmeasure": 0.16446961325308998,
+      "dataset_path": "GEM/web_nlg",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0034245269422962608
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json
new file mode 100644
index 0000000000000000000000000000000000000000..effb976d508e077c2223ef9eb32da13b24ab1dc2
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_0.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.14831832571904127,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0020031574591708096
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.23669379639579727,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.002619263402445374
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.16843326604027012,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0018787988149176356
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.027495279137204204,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0008008675778483577
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.04558631579219217,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0013034624565288761
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.03124497623150672,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.000826359540629872
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.11944058601706234,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0014613554843883904
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.197662087047554,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0022098401272736727
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.1373698337101501,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0014061340796650912
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.13662513471708013,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.001834224913870075
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.21848967035350617,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.00242500324850602
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.15530559329563356,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0017277092690194619
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 1.4416513225274035,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.07150581722616807
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 0,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..a549d7daebca055f90284836950bad7b0ebb45ae
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_1.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.2474989333475308,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004192775409890903
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.16589702996724112,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.002477684211999076
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.1595930665634592,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0020260063889951897
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.06456439217534583,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0026036519415425876
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.03395563608028862,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0011867665512589468
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.03442090774378983,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0011197085684525982
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.2007100384592662,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0036402244436710527
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.12968060330920403,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0019100623389590725
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.12543184498335516,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0015786460435038327
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.2356687196157363,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.004040097148447038
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.15692818247747958,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0023149233432171795
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.15113313197846198,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0019006022698345565
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 1.9112541973999004,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.08163120654641273
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 1,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..cbc23f38f5e0c730066097a7d95ab09ab1449566
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_2.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.3671552578777415,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.00464310641179873
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.18316424412100443,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.002534818661881543
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.20013093911819188,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0021449192871676984
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.11652986149569292,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0033165278944766326
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.048557963351484004,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0012897480876968325
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.05444986552431719,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0012835369876590637
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.3030100733810117,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.004160446544180668
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.1454519391726235,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0020102539956042787
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.1600577177279773,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0017194821563251488
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.34948494261958846,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.00452044298551405
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.17241597783925564,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0023681037127358745
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.18899973850835908,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0020281161914398288
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 2.4920293719291493,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.06820740373159184
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 2,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..1742a9bda1ed3552655d6dc7dcdb569379b7c44d
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_3.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.31853023306817924,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.005101746577027814
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.14722792301810253,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0026467115286204013
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.16433308908585867,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002402134739956357
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.10260882729326853,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0033488070271773295
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.04037648707007591,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0012615029724490608
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.04583954485462417,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0012897030660356447
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.26690611784240603,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.004530048771512807
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.11874183155575303,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0021251987841602225
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.13364146877613045,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.00195553742839865
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.3046492013868888,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.004949401564936648
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.1393491071946238,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0024976587731928646
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.15587618164528091,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.002273493914941302
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 1.6055821821151304,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.06565825928935355
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 3,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e4ce0f01a20660cae2a50a11cc15ca1c1d1f226
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.10914047578759525,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.004168163234199949
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.04875414041264775,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0020018963702338544
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.05455257649733116,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.002012404693346845
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.037261877113724175,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.002437336913811512
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.013448412424643126,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.0008658300088105147
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.01577378906241384,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.0009385589100462621
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.09325777757588789,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.003680574837942309
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.03991105511182912,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0016279952050045603
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.045100032510421204,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0016666399875103265
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.10390247551362326,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.004006559051889402
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.04597031630948619,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0018861168846922784
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.051422475813845084,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0018931617381586926
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 0.006932560692974423,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 0.0017990239865701577
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..61b0a76c8ae322ff5b169ebc9cc3aae433ccf084
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_GEM-wiki_lingua_en_tldr_en_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_precision": 0.019024182319545703,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_precision_stderr": 0.0019567530452185085
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_recall": 0.00809080049652358,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_recall_stderr": 0.0008930277662354756
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge1_fmeasure": 0.009250948371284507,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge1_fmeasure_stderr": 0.0009365636508956353
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_precision": 0.007878162042298584,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_precision_stderr": 0.0012119853940782482
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_recall": 0.0024981499124714754,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_recall_stderr": 0.00036249889299460995
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rouge2_fmeasure": 0.0031062476201456015,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rouge2_fmeasure_stderr": 0.00043823448085888057
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_precision": 0.016781942931569222,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_precision_stderr": 0.0017964368358031513
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_recall": 0.006703878314586593,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_recall_stderr": 0.0007330971808896851
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeL_fmeasure": 0.007784924334651645,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeL_fmeasure_stderr": 0.0007922903839285351
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_precision": 0.01855296243342967,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_precision_stderr": 0.001923182050974051
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_recall": 0.00783323994370169,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_recall_stderr": 0.0008689584836341853
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "rougeLsum_fmeasure": 0.008943947340808445,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "rougeLsum_fmeasure_stderr": 0.0009064353361518755
+    },
+    {
+      "task_name": "GEM/wiki_lingua_en",
+      "prompt_name": "tldr_en",
+      "bleu": 1.5486400435190646e-20,
+      "dataset_path": "GEM/wiki_lingua",
+      "dataset_name": "en",
+      "subset": null,
+      "bleu_stderr": 5.928974165985541e-18
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json
new file mode 100644
index 0000000000000000000000000000000000000000..9b3a5dbe8c1b0ad4f5eb80b417f31411772f0cc7
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_4.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.06536803730199292,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0040080446277792475
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.0562578647719299,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.003466710519052521
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.05638135159346031,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0033862796262482922
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.013453387419618108,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0014082914388032289
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.012484556808642255,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0012859869239756115
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.01218356666574454,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0012325430366165824
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.049695294130091966,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0031067522139507655
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.04191724945748147,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0026229390757436582
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.042197213021247744,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0025684294584781297
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.0501407378152416,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.00312422462775062
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.04244935232602974,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0026544789695008624
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.042656324273577836,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.002590374442922286
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 0.114133025618453,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.03902166340342278
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 4,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json
new file mode 100644
index 0000000000000000000000000000000000000000..69629d02d9fb3e58d19a500f27832b84be9a10f0
--- /dev/null
+++ b/evaluation/generation/slim.lm1-2b8-55b-oscarroots_gem_xsum_article_DOC_summary_5.json
@@ -0,0 +1,133 @@
+{
+  "results": [
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_precision": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_precision_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_recall": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_recall_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge1_fmeasure": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge1_fmeasure_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_precision": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_precision_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_recall": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_recall_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rouge2_fmeasure": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rouge2_fmeasure_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_precision": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_precision_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_recall": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_recall_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeL_fmeasure": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeL_fmeasure_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_precision": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_precision_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_recall": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_recall_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "rougeLsum_fmeasure": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "rougeLsum_fmeasure_stderr": 0.0
+    },
+    {
+      "task_name": "gem_xsum",
+      "prompt_name": "article_DOC_summary",
+      "bleu": 0.0,
+      "dataset_path": "GEM/xsum",
+      "dataset_name": null,
+      "subset": "",
+      "bleu_stderr": 0.0
+    }
+  ],
+  "config": {
+    "model": "hf-causal",
+    "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-2b8-55b-oscarroots/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16",
+    "task_args": "",
+    "num_fewshot": 5,
+    "batch_size": 16,
+    "device": "cuda",
+    "use_cache": false,
+    "limit": 3000,
+    "bootstrap_iters": 10,
+    "seed": 1234
+  }
+}
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv
new file mode 100644
index 0000000000000000000000000000000000000000..9a783151c99e2fb57fd6bed4ac0a4a64dd09a014
--- /dev/null
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.322,0.014782913600996673,0
+anli_r2,acc,0.331,0.014888272588203936,0
+anli_r3,acc,0.3358333333333333,0.013639261190932887,0
+arc_challenge,acc,0.2380546075085324,0.012445770028026208,0
+arc_challenge,acc_norm,0.2627986348122867,0.012862523175351335,0
+arc_easy,acc,0.5513468013468014,0.010205540414612862,0
+arc_easy,acc_norm,0.49326599326599324,0.010258852980991825,0
+boolq,acc,0.5850152905198777,0.008617716361921567,1
+cb,acc,0.35714285714285715,0.0646095738380922,1
+cb,f1,0.24888576120103215,,1
+copa,acc,0.68,0.046882617226215034,0
+hellaswag,acc,0.38169687313284206,0.004848099661619686,0
+hellaswag,acc_norm,0.47689703246365267,0.004984452002563925,0
+piqa,acc,0.7121871599564744,0.01056325038305919,0
+piqa,acc_norm,0.7094668117519043,0.010592765034696534,0
+rte,acc,0.5270758122743683,0.030052303463143706,0
+sciq,acc,0.804,0.012559527926707378,0
+sciq,acc_norm,0.722,0.014174516461485247,0
+storycloze_2016,acc,0.6654195617316943,0.01091131896712794,0
+winogrande,acc,0.5138121546961326,0.014047122916440415,0
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json
index d743475ed42e665cc658484b0fffc490d74d9170..8a11443b4b309ef0fcdb477171d9a075d2f44646 100644
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0.json
@@ -20,6 +20,52 @@
         "copa": {
             "acc": 0.68,
             "acc_stderr": 0.046882617226215034
+        },
+        "hellaswag": {
+            "acc": 0.38169687313284206,
+            "acc_stderr": 0.004848099661619686,
+            "acc_norm": 0.47689703246365267,
+            "acc_norm_stderr": 0.004984452002563925
+        },
+        "rte": {
+            "acc": 0.5270758122743683,
+            "acc_stderr": 0.030052303463143706
+        },
+        "winogrande": {
+            "acc": 0.5138121546961326,
+            "acc_stderr": 0.014047122916440415
+        },
+        "storycloze_2016": {
+            "acc": 0.6654195617316943,
+            "acc_stderr": 0.01091131896712794
+        },
+        "boolq": {
+            "acc": 0.5850152905198777,
+            "acc_stderr": 0.008617716361921567
+        },
+        "arc_easy": {
+            "acc": 0.5513468013468014,
+            "acc_stderr": 0.010205540414612862,
+            "acc_norm": 0.49326599326599324,
+            "acc_norm_stderr": 0.010258852980991825
+        },
+        "arc_challenge": {
+            "acc": 0.2380546075085324,
+            "acc_stderr": 0.012445770028026208,
+            "acc_norm": 0.2627986348122867,
+            "acc_norm_stderr": 0.012862523175351335
+        },
+        "sciq": {
+            "acc": 0.804,
+            "acc_stderr": 0.012559527926707378,
+            "acc_norm": 0.722,
+            "acc_norm_stderr": 0.014174516461485247
+        },
+        "piqa": {
+            "acc": 0.7121871599564744,
+            "acc_stderr": 0.01056325038305919,
+            "acc_norm": 0.7094668117519043,
+            "acc_norm_stderr": 0.010592765034696534
         }
     },
     "versions": {
@@ -27,6 +73,15 @@
         "anli_r2": 0,
         "anli_r3": 0,
         "cb": 1,
-        "copa": 0
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
     }
 }
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json
deleted file mode 100644
index d743475ed42e665cc658484b0fffc490d74d9170..0000000000000000000000000000000000000000
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_0_lm-eval_global_step52452_2023-02-25-11-16-27_0shots_backup.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.322,
-            "acc_stderr": 0.014782913600996673
-        },
-        "anli_r2": {
-            "acc": 0.331,
-            "acc_stderr": 0.014888272588203936
-        },
-        "anli_r3": {
-            "acc": 0.3358333333333333,
-            "acc_stderr": 0.013639261190932887
-        },
-        "cb": {
-            "acc": 0.35714285714285715,
-            "acc_stderr": 0.0646095738380922,
-            "f1": 0.24888576120103215
-        },
-        "copa": {
-            "acc": 0.68,
-            "acc_stderr": 0.046882617226215034
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0
-    }
-}
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv
new file mode 100644
index 0000000000000000000000000000000000000000..ba5877a90a5786fc5a06117d5d75a35c4e2bf3c3
--- /dev/null
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.303,0.014539683710535255,0
+anli_r2,acc,0.319,0.014746404865473486,0
+anli_r3,acc,0.33416666666666667,0.013622434813136781,0
+arc_challenge,acc,0.2431740614334471,0.012536554144587092,0
+arc_challenge,acc_norm,0.28071672354948807,0.013131238126975583,0
+arc_easy,acc,0.5631313131313131,0.010177672928157681,0
+arc_easy,acc_norm,0.5273569023569024,0.010244415164390529,0
+boolq,acc,0.5813455657492355,0.008628545022868554,1
+cb,acc,0.48214285714285715,0.06737697508644648,1
+cb,f1,0.3270348837209302,,1
+copa,acc,0.66,0.04760952285695237,0
+hellaswag,acc,0.3815972913762199,0.004847857546957478,0
+hellaswag,acc_norm,0.477096195976897,0.004984543540932335,0
+piqa,acc,0.7078346028291621,0.010610252174513658,0
+piqa,acc_norm,0.6996735582154516,0.010695225308183145,0
+rte,acc,0.5306859205776173,0.03003973059219781,0
+sciq,acc,0.871,0.010605256784796565,0
+sciq,acc_norm,0.861,0.010945263761042963,0
+storycloze_2016,acc,0.655264564404062,0.01099083028205749,0
+winogrande,acc,0.5414364640883977,0.0140041468537919,0
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json
index 1d45ddcd822d1d31b6bd83c272c1975e920a8c59..d0f4fde80498d2b81785a034bd67447b961ac856 100644
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1.json
@@ -20,6 +20,52 @@
         "copa": {
             "acc": 0.66,
             "acc_stderr": 0.04760952285695237
+        },
+        "hellaswag": {
+            "acc": 0.3815972913762199,
+            "acc_stderr": 0.004847857546957478,
+            "acc_norm": 0.477096195976897,
+            "acc_norm_stderr": 0.004984543540932335
+        },
+        "rte": {
+            "acc": 0.5306859205776173,
+            "acc_stderr": 0.03003973059219781
+        },
+        "winogrande": {
+            "acc": 0.5414364640883977,
+            "acc_stderr": 0.0140041468537919
+        },
+        "storycloze_2016": {
+            "acc": 0.655264564404062,
+            "acc_stderr": 0.01099083028205749
+        },
+        "boolq": {
+            "acc": 0.5813455657492355,
+            "acc_stderr": 0.008628545022868554
+        },
+        "arc_easy": {
+            "acc": 0.5631313131313131,
+            "acc_stderr": 0.010177672928157681,
+            "acc_norm": 0.5273569023569024,
+            "acc_norm_stderr": 0.010244415164390529
+        },
+        "arc_challenge": {
+            "acc": 0.2431740614334471,
+            "acc_stderr": 0.012536554144587092,
+            "acc_norm": 0.28071672354948807,
+            "acc_norm_stderr": 0.013131238126975583
+        },
+        "sciq": {
+            "acc": 0.871,
+            "acc_stderr": 0.010605256784796565,
+            "acc_norm": 0.861,
+            "acc_norm_stderr": 0.010945263761042963
+        },
+        "piqa": {
+            "acc": 0.7078346028291621,
+            "acc_stderr": 0.010610252174513658,
+            "acc_norm": 0.6996735582154516,
+            "acc_norm_stderr": 0.010695225308183145
         }
     },
     "versions": {
@@ -27,6 +73,15 @@
         "anli_r2": 0,
         "anli_r3": 0,
         "cb": 1,
-        "copa": 0
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
     }
 }
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json
deleted file mode 100644
index 1d45ddcd822d1d31b6bd83c272c1975e920a8c59..0000000000000000000000000000000000000000
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_1_lm-eval_global_step52452_2023-02-25-11-18-29_1shots_backup.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.303,
-            "acc_stderr": 0.014539683710535255
-        },
-        "anli_r2": {
-            "acc": 0.319,
-            "acc_stderr": 0.014746404865473486
-        },
-        "anli_r3": {
-            "acc": 0.33416666666666667,
-            "acc_stderr": 0.013622434813136781
-        },
-        "cb": {
-            "acc": 0.48214285714285715,
-            "acc_stderr": 0.06737697508644648,
-            "f1": 0.3270348837209302
-        },
-        "copa": {
-            "acc": 0.66,
-            "acc_stderr": 0.04760952285695237
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0
-    }
-}
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv
new file mode 100644
index 0000000000000000000000000000000000000000..9ae892f5463c3edbcd6e22e2840c92a7a4f03547
--- /dev/null
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.311,0.014645596385722694,0
+anli_r2,acc,0.346,0.01505026612756444,0
+anli_r3,acc,0.325,0.013526454480351014,0
+arc_challenge,acc,0.24573378839590443,0.012581033453730114,0
+arc_challenge,acc_norm,0.27986348122866894,0.013119040897725922,0
+arc_easy,acc,0.5744949494949495,0.010145271182591021,0
+arc_easy,acc_norm,0.5471380471380471,0.010214087372211392,0
+boolq,acc,0.5629969418960244,0.008675365793227084,1
+cb,acc,0.4107142857142857,0.0663363415035954,1
+cb,f1,0.26927814732692784,,1
+copa,acc,0.68,0.04688261722621505,0
+hellaswag,acc,0.38149770961959767,0.00484761521647344,0
+hellaswag,acc_norm,0.4757020513841864,0.004983886091690525,0
+piqa,acc,0.7154515778019587,0.010527218464130605,0
+piqa,acc_norm,0.7105549510337323,0.01058101474067561,0
+rte,acc,0.5379061371841155,0.03000984891252912,0
+sciq,acc,0.885,0.010093407594904628,0
+sciq,acc_norm,0.88,0.010281328012747384,0
+storycloze_2016,acc,0.6547300908605024,0.010994860223187675,0
+winogrande,acc,0.5272296764009471,0.014031631629827696,0
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json
index d081bc3b334843d38818453d028bc3987662ad70..c4d124763162d14a87ad9d0e0d4c23ca1cd2efc9 100644
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2.json
@@ -20,6 +20,52 @@
         "copa": {
             "acc": 0.68,
             "acc_stderr": 0.04688261722621505
+        },
+        "hellaswag": {
+            "acc": 0.38149770961959767,
+            "acc_stderr": 0.00484761521647344,
+            "acc_norm": 0.4757020513841864,
+            "acc_norm_stderr": 0.004983886091690525
+        },
+        "rte": {
+            "acc": 0.5379061371841155,
+            "acc_stderr": 0.03000984891252912
+        },
+        "winogrande": {
+            "acc": 0.5272296764009471,
+            "acc_stderr": 0.014031631629827696
+        },
+        "storycloze_2016": {
+            "acc": 0.6547300908605024,
+            "acc_stderr": 0.010994860223187675
+        },
+        "boolq": {
+            "acc": 0.5629969418960244,
+            "acc_stderr": 0.008675365793227084
+        },
+        "arc_easy": {
+            "acc": 0.5744949494949495,
+            "acc_stderr": 0.010145271182591021,
+            "acc_norm": 0.5471380471380471,
+            "acc_norm_stderr": 0.010214087372211392
+        },
+        "arc_challenge": {
+            "acc": 0.24573378839590443,
+            "acc_stderr": 0.012581033453730114,
+            "acc_norm": 0.27986348122866894,
+            "acc_norm_stderr": 0.013119040897725922
+        },
+        "sciq": {
+            "acc": 0.885,
+            "acc_stderr": 0.010093407594904628,
+            "acc_norm": 0.88,
+            "acc_norm_stderr": 0.010281328012747384
+        },
+        "piqa": {
+            "acc": 0.7154515778019587,
+            "acc_stderr": 0.010527218464130605,
+            "acc_norm": 0.7105549510337323,
+            "acc_norm_stderr": 0.01058101474067561
         }
     },
     "versions": {
@@ -27,6 +73,15 @@
         "anli_r2": 0,
         "anli_r3": 0,
         "cb": 1,
-        "copa": 0
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
     }
 }
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2_lm-eval_global_step52452_2023-02-25-11-18-29_2shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_2_lm-eval_global_step52452_2023-02-25-11-18-29_2shots_backup.json
deleted file mode 100644
index d081bc3b334843d38818453d028bc3987662ad70..0000000000000000000000000000000000000000
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_2_lm-eval_global_step52452_2023-02-25-11-18-29_2shots_backup.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.311,
-            "acc_stderr": 0.014645596385722694
-        },
-        "anli_r2": {
-            "acc": 0.346,
-            "acc_stderr": 0.01505026612756444
-        },
-        "anli_r3": {
-            "acc": 0.325,
-            "acc_stderr": 0.013526454480351014
-        },
-        "cb": {
-            "acc": 0.4107142857142857,
-            "acc_stderr": 0.0663363415035954,
-            "f1": 0.26927814732692784
-        },
-        "copa": {
-            "acc": 0.68,
-            "acc_stderr": 0.04688261722621505
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0
-    }
-}
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.csv
new file mode 100644
index 0000000000000000000000000000000000000000..1e192117c28be80ce1cb36706c50622a3ce92381
--- /dev/null
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.316,0.014709193056057134,0
+anli_r2,acc,0.344,0.015029633724408945,0
+anli_r3,acc,0.335,0.013630871843821482,0
+arc_challenge,acc,0.23720136518771331,0.012430399829260844,0
+arc_challenge,acc_norm,0.2883959044368601,0.01323839442242817,0
+arc_easy,acc,0.5698653198653199,0.010159130445178502,0
+arc_easy,acc_norm,0.5517676767676768,0.010204645126856942,0
+boolq,acc,0.5660550458715596,0.008668405003744127,1
+cb,acc,0.48214285714285715,0.0673769750864465,1
+cb,f1,0.32495309568480296,,1
+copa,acc,0.68,0.04688261722621504,0
+hellaswag,acc,0.38179645488946423,0.004848341560492138,0
+hellaswag,acc_norm,0.4785899223262298,0.004985204766555062,0
+piqa,acc,0.7187159956474428,0.010490509832327423,0
+piqa,acc_norm,0.7127312295973884,0.010557291761528637,0
+rte,acc,0.5054151624548736,0.030094698123239966,0
+sciq,acc,0.882,0.010206869264381791,0
+sciq,acc_norm,0.879,0.010318210380946097,0
+storycloze_2016,acc,0.6541956173169428,0.010998874799044323,0
+winogrande,acc,0.5288082083662194,0.014029141615909617,0
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json
index 97521dea31f26287cb5e405d1e108098872f9deb..bb08b35b8b95430d58ebd882b84d5709f210d88d 100644
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3.json
@@ -20,6 +20,52 @@
         "copa": {
             "acc": 0.68,
             "acc_stderr": 0.04688261722621504
+        },
+        "hellaswag": {
+            "acc": 0.38179645488946423,
+            "acc_stderr": 0.004848341560492138,
+            "acc_norm": 0.4785899223262298,
+            "acc_norm_stderr": 0.004985204766555062
+        },
+        "rte": {
+            "acc": 0.5054151624548736,
+            "acc_stderr": 0.030094698123239966
+        },
+        "winogrande": {
+            "acc": 0.5288082083662194,
+            "acc_stderr": 0.014029141615909617
+        },
+        "storycloze_2016": {
+            "acc": 0.6541956173169428,
+            "acc_stderr": 0.010998874799044323
+        },
+        "boolq": {
+            "acc": 0.5660550458715596,
+            "acc_stderr": 0.008668405003744127
+        },
+        "arc_easy": {
+            "acc": 0.5698653198653199,
+            "acc_stderr": 0.010159130445178502,
+            "acc_norm": 0.5517676767676768,
+            "acc_norm_stderr": 0.010204645126856942
+        },
+        "arc_challenge": {
+            "acc": 0.23720136518771331,
+            "acc_stderr": 0.012430399829260844,
+            "acc_norm": 0.2883959044368601,
+            "acc_norm_stderr": 0.01323839442242817
+        },
+        "sciq": {
+            "acc": 0.882,
+            "acc_stderr": 0.010206869264381791,
+            "acc_norm": 0.879,
+            "acc_norm_stderr": 0.010318210380946097
+        },
+        "piqa": {
+            "acc": 0.7187159956474428,
+            "acc_stderr": 0.010490509832327423,
+            "acc_norm": 0.7127312295973884,
+            "acc_norm_stderr": 0.010557291761528637
         }
     },
     "versions": {
@@ -27,6 +73,15 @@
         "anli_r2": 0,
         "anli_r3": 0,
         "cb": 1,
-        "copa": 0
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
     }
 }
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3_lm-eval_global_step52452_2023-02-25-11-18-29_3shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_3_lm-eval_global_step52452_2023-02-25-11-18-29_3shots_backup.json
deleted file mode 100644
index 97521dea31f26287cb5e405d1e108098872f9deb..0000000000000000000000000000000000000000
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_3_lm-eval_global_step52452_2023-02-25-11-18-29_3shots_backup.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.316,
-            "acc_stderr": 0.014709193056057134
-        },
-        "anli_r2": {
-            "acc": 0.344,
-            "acc_stderr": 0.015029633724408945
-        },
-        "anli_r3": {
-            "acc": 0.335,
-            "acc_stderr": 0.013630871843821482
-        },
-        "cb": {
-            "acc": 0.48214285714285715,
-            "acc_stderr": 0.0673769750864465,
-            "f1": 0.32495309568480296
-        },
-        "copa": {
-            "acc": 0.68,
-            "acc_stderr": 0.04688261722621504
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0
-    }
-}
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.csv
new file mode 100644
index 0000000000000000000000000000000000000000..7a2243ef0b1887e20cfd4e3ddea65497a2fe5130
--- /dev/null
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.34,0.014987482264363935,0
+anli_r2,acc,0.339,0.01497675877162034,0
+anli_r3,acc,0.33416666666666667,0.01362243481313678,0
+arc_challenge,acc,0.24061433447098976,0.012491468532390573,0
+arc_challenge,acc_norm,0.27303754266211605,0.013019332762635743,0
+arc_easy,acc,0.5732323232323232,0.010149141043955636,0
+arc_easy,acc_norm,0.5589225589225589,0.010188293221040564,0
+boolq,acc,0.5519877675840978,0.008697655510897233,1
+cb,acc,0.4642857142857143,0.06724777654937658,1
+cb,f1,0.3085858585858586,,1
+copa,acc,0.73,0.0446196043338474,0
+hellaswag,acc,0.38149770961959767,0.0048476152164734386,0
+hellaswag,acc_norm,0.47938657637920734,0.004985539159783411,0
+piqa,acc,0.7067464635473341,0.010621818421101924,0
+piqa,acc_norm,0.704570184983678,0.010644731559342467,0
+rte,acc,0.5090252707581228,0.030091559826331334,0
+sciq,acc,0.899,0.00953361892934102,0
+sciq,acc_norm,0.902,0.00940661918462123,0
+storycloze_2016,acc,0.6536611437733832,0.01100287402644642,0
+winogrande,acc,0.5295974743488555,0.014027843827840086,0
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json
index 54d014eb10da0c80fac0ac75c3fc41fcef7b7980..58b11ab206673db71a0b7816c7254f8c164c01ba 100644
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4.json
@@ -20,6 +20,52 @@
         "copa": {
             "acc": 0.73,
             "acc_stderr": 0.0446196043338474
+        },
+        "hellaswag": {
+            "acc": 0.38149770961959767,
+            "acc_stderr": 0.0048476152164734386,
+            "acc_norm": 0.47938657637920734,
+            "acc_norm_stderr": 0.004985539159783411
+        },
+        "rte": {
+            "acc": 0.5090252707581228,
+            "acc_stderr": 0.030091559826331334
+        },
+        "winogrande": {
+            "acc": 0.5295974743488555,
+            "acc_stderr": 0.014027843827840086
+        },
+        "storycloze_2016": {
+            "acc": 0.6536611437733832,
+            "acc_stderr": 0.01100287402644642
+        },
+        "boolq": {
+            "acc": 0.5519877675840978,
+            "acc_stderr": 0.008697655510897233
+        },
+        "arc_easy": {
+            "acc": 0.5732323232323232,
+            "acc_stderr": 0.010149141043955636,
+            "acc_norm": 0.5589225589225589,
+            "acc_norm_stderr": 0.010188293221040564
+        },
+        "arc_challenge": {
+            "acc": 0.24061433447098976,
+            "acc_stderr": 0.012491468532390573,
+            "acc_norm": 0.27303754266211605,
+            "acc_norm_stderr": 0.013019332762635743
+        },
+        "sciq": {
+            "acc": 0.899,
+            "acc_stderr": 0.00953361892934102,
+            "acc_norm": 0.902,
+            "acc_norm_stderr": 0.00940661918462123
+        },
+        "piqa": {
+            "acc": 0.7067464635473341,
+            "acc_stderr": 0.010621818421101924,
+            "acc_norm": 0.704570184983678,
+            "acc_norm_stderr": 0.010644731559342467
         }
     },
     "versions": {
@@ -27,6 +73,15 @@
         "anli_r2": 0,
         "anli_r3": 0,
         "cb": 1,
-        "copa": 0
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
     }
 }
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4_lm-eval_global_step52452_2023-02-25-11-16-27_4shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_4_lm-eval_global_step52452_2023-02-25-11-16-27_4shots_backup.json
deleted file mode 100644
index 54d014eb10da0c80fac0ac75c3fc41fcef7b7980..0000000000000000000000000000000000000000
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_4_lm-eval_global_step52452_2023-02-25-11-16-27_4shots_backup.json
+++ /dev/null
@@ -1,32 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.34,
-            "acc_stderr": 0.014987482264363935
-        },
-        "anli_r2": {
-            "acc": 0.339,
-            "acc_stderr": 0.01497675877162034
-        },
-        "anli_r3": {
-            "acc": 0.33416666666666667,
-            "acc_stderr": 0.01362243481313678
-        },
-        "cb": {
-            "acc": 0.4642857142857143,
-            "acc_stderr": 0.06724777654937658,
-            "f1": 0.3085858585858586
-        },
-        "copa": {
-            "acc": 0.73,
-            "acc_stderr": 0.0446196043338474
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0,
-        "anli_r3": 0,
-        "cb": 1,
-        "copa": 0
-    }
-}
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.csv b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.csv
new file mode 100644
index 0000000000000000000000000000000000000000..df7e4a65b658b9bfc344204a72368adee3feadca
--- /dev/null
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.csv
@@ -0,0 +1,21 @@
+task,metric,value,err,version
+anli_r1,acc,0.339,0.014976758771620342,0
+anli_r2,acc,0.347,0.015060472031706615,0
+anli_r3,acc,0.3425,0.013704669762934722,0
+arc_challenge,acc,0.2551194539249147,0.012739038695202102,0
+arc_challenge,acc_norm,0.26621160409556316,0.012915774781523214,0
+arc_easy,acc,0.5711279461279462,0.010155440652900154,0
+arc_easy,acc_norm,0.5467171717171717,0.010214901516731609,0
+boolq,acc,0.5547400611620795,0.008692488322023063,1
+cb,acc,0.375,0.06527912098338669,1
+cb,f1,0.2429169746242917,,1
+copa,acc,0.7,0.046056618647183814,0
+hellaswag,acc,0.3819956184027086,0.004848824710995933,0
+hellaswag,acc_norm,0.483469428400717,0.0049870536525402735,0
+piqa,acc,0.6953210010881393,0.010738889044325161,0
+piqa,acc_norm,0.7029379760609358,0.010661725404814783,0
+rte,acc,0.5631768953068592,0.02985524739031494,0
+sciq,acc,0.899,0.009533618929340995,0
+sciq,acc_norm,0.903,0.009363689373248121,0
+storycloze_2016,acc,0.6606092998396579,0.010949682016358629,0
+winogrande,acc,0.526440410418311,0.014032823874407225,0
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json
index 01defb03f69355c1ce57dae755114fd90728e8ab..2377bf3a2d94368d8b1042c07d3a55a146f0809d 100644
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json
+++ b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5.json
@@ -7,10 +7,81 @@
         "anli_r2": {
             "acc": 0.347,
             "acc_stderr": 0.015060472031706615
+        },
+        "anli_r3": {
+            "acc": 0.3425,
+            "acc_stderr": 0.013704669762934722
+        },
+        "cb": {
+            "acc": 0.375,
+            "acc_stderr": 0.06527912098338669,
+            "f1": 0.2429169746242917
+        },
+        "copa": {
+            "acc": 0.7,
+            "acc_stderr": 0.046056618647183814
+        },
+        "hellaswag": {
+            "acc": 0.3819956184027086,
+            "acc_stderr": 0.004848824710995933,
+            "acc_norm": 0.483469428400717,
+            "acc_norm_stderr": 0.0049870536525402735
+        },
+        "rte": {
+            "acc": 0.5631768953068592,
+            "acc_stderr": 0.02985524739031494
+        },
+        "winogrande": {
+            "acc": 0.526440410418311,
+            "acc_stderr": 0.014032823874407225
+        },
+        "storycloze_2016": {
+            "acc": 0.6606092998396579,
+            "acc_stderr": 0.010949682016358629
+        },
+        "boolq": {
+            "acc": 0.5547400611620795,
+            "acc_stderr": 0.008692488322023063
+        },
+        "arc_easy": {
+            "acc": 0.5711279461279462,
+            "acc_stderr": 0.010155440652900154,
+            "acc_norm": 0.5467171717171717,
+            "acc_norm_stderr": 0.010214901516731609
+        },
+        "arc_challenge": {
+            "acc": 0.2551194539249147,
+            "acc_stderr": 0.012739038695202102,
+            "acc_norm": 0.26621160409556316,
+            "acc_norm_stderr": 0.012915774781523214
+        },
+        "sciq": {
+            "acc": 0.899,
+            "acc_stderr": 0.009533618929340995,
+            "acc_norm": 0.903,
+            "acc_norm_stderr": 0.009363689373248121
+        },
+        "piqa": {
+            "acc": 0.6953210010881393,
+            "acc_stderr": 0.010738889044325161,
+            "acc_norm": 0.7029379760609358,
+            "acc_norm_stderr": 0.010661725404814783
         }
     },
     "versions": {
         "anli_r1": 0,
-        "anli_r2": 0
+        "anli_r2": 0,
+        "anli_r3": 0,
+        "cb": 1,
+        "copa": 0,
+        "hellaswag": 0,
+        "rte": 0,
+        "winogrande": 0,
+        "storycloze_2016": 0,
+        "boolq": 1,
+        "arc_easy": 0,
+        "arc_challenge": 0,
+        "sciq": 0,
+        "piqa": 0
     }
 }
\ No newline at end of file
diff --git a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5_lm-eval_global_step52452_2023-02-25-11-16-27_5shots_backup.json b/evaluation/rankeval/lm1-2b8-55b-oscarroots_5_lm-eval_global_step52452_2023-02-25-11-16-27_5shots_backup.json
deleted file mode 100644
index 01defb03f69355c1ce57dae755114fd90728e8ab..0000000000000000000000000000000000000000
--- a/evaluation/rankeval/lm1-2b8-55b-oscarroots_5_lm-eval_global_step52452_2023-02-25-11-16-27_5shots_backup.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-    "results": {
-        "anli_r1": {
-            "acc": 0.339,
-            "acc_stderr": 0.014976758771620342
-        },
-        "anli_r2": {
-            "acc": 0.347,
-            "acc_stderr": 0.015060472031706615
-        }
-    },
-    "versions": {
-        "anli_r1": 0,
-        "anli_r2": 0
-    }
-}
\ No newline at end of file